Commit 7d1a83a9 authored by aiss

Push DeepSpeed 0.6.3 ROCm version

parent ab5534fc
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "gemm_test_hip.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
inline int DS_GET_BLOCKS(const int N)
{
return (std::max)(
(std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
// Use at least 1 block, since CUDA does not allow empty block
1);
}
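// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// A minimal example of how DS_GET_BLOCKS and CUDA_1D_KERNEL_LOOP are meant to be paired:
// cap the grid at DS_MAXIMUM_NUM_BLOCKS and let the grid-stride loop cover any remaining
// elements. `ds_scale_kernel` / `ds_scale` are hypothetical names used only for this sketch.
__global__ void ds_scale_kernel(float* data, float factor, int n)
{
    CUDA_1D_KERNEL_LOOP(i, n) { data[i] *= factor; }
}
inline void ds_scale(float* data, float factor, int n, hipStream_t stream)
{
    hipLaunchKernelGGL(ds_scale_kernel,
                       dim3(DS_GET_BLOCKS(n)),
                       dim3(DS_CUDA_NUM_THREADS),
                       0,
                       stream,
                       data,
                       factor,
                       n);
}
// ------------------------------------------------------------------------------------------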
class Context {
public:
Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
{
hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
auto message = std::string("Fail to create cublas handle.");
std::cerr << message << std::endl;
throw std::runtime_error(message);
}
}
virtual ~Context()
{
rocblas_destroy_handle(_cublasHandle);
hipFree(_workspace);
}
static Context& Instance()
{
static Context _ctx;
return _ctx;
}
void SetWorkSpace(void* workspace)
{
if (!workspace) { throw std::runtime_error("Workspace is null."); }
_workspace = workspace;
}
void* GetWorkSpace() { return _workspace; }
hiprandGenerator_t& GetRandGenerator() { return _gen; }
hipStream_t GetCurrentStream()
{
// get current pytorch stream.
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
return stream;
}
hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }
rocblas_handle GetCublasHandle() { return _cublasHandle; }
std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
{
uint64_t offset = _curr_offset;
_curr_offset += offset_inc;
return std::pair<uint64_t, uint64_t>(_seed, offset);
}
void SetSeed(uint64_t new_seed) { _seed = new_seed; }
void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
{
// avoid rerun.
if (_gemm_algos.size() > 0) return;
if (test_gemm) {
rocblas_handle handle = GetCublasHandle();
std::unique_ptr<GemmTest<__half>> test_qkv_fw(
new GemmTest<__half>(batch_size * seq_len, // M
head_num * size_per_head, // N
head_num * size_per_head, // K
rocblas_operation_transpose,
rocblas_operation_none,
handle));
std::unique_ptr<GemmTest<__half>> test_inter(
new GemmTest<__half>(batch_size * seq_len, // M
4 * head_num * size_per_head, // N
head_num * size_per_head, // K
rocblas_operation_transpose,
rocblas_operation_none,
handle));
std::unique_ptr<GemmTest<__half>> test_output(
new GemmTest<__half>(batch_size * seq_len, // M
head_num * size_per_head, // N
4 * head_num * size_per_head, // K
rocblas_operation_transpose,
rocblas_operation_none,
handle));
std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
new StridedGemmTest<__half>(batch_size * head_num, // batch
seq_len, // M
seq_len, // N
size_per_head, // K
rocblas_operation_transpose,
rocblas_operation_none,
handle));
std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
new StridedGemmTest<__half>(batch_size * head_num, // batch
size_per_head, // M
seq_len, // N
seq_len, // K
rocblas_operation_none,
rocblas_operation_none,
handle));
_gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
_gemm_algos.push_back(test_inter->TestAlgo(100));
_gemm_algos.push_back(test_output->TestAlgo(100));
_gemm_algos.push_back(test_attn_scores->TestAlgo(100));
_gemm_algos.push_back(test_attn_context->TestAlgo(100));
} else {
// Use default algo.
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
_gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
}
}
const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }
private:
hiprandGenerator_t _gen;
rocblas_handle _cublasHandle;
void* _workspace;
uint64_t _seed;
uint64_t _curr_offset;
std::vector<std::array<int, 3>> _gemm_algos;
};
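// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// A minimal sketch of how the Context singleton above is consumed by the kernels and
// optimizers in this commit: one shared rocBLAS handle, the current PyTorch HIP stream,
// and a (seed, offset) pair for RNG state in dropout-style kernels.
inline void example_context_usage()
{
    hipStream_t stream = Context::Instance().GetCurrentStream();
    rocblas_handle blas = Context::Instance().GetCublasHandle();
    auto seed_and_offset = Context::Instance().IncrementOffset(16);  // advance RNG offset by 16
    (void)stream;
    (void)blas;
    (void)seed_and_offset;
}
// ------------------------------------------------------------------------------------------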
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adagrad_Optimizer {
public:
Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
: _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
{
cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adagrad_Optimizer()
{
cudaFreeHost(_doubled_buffer[0]);
cudaFreeHost(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step)
{
_step++;
if (_step != step) { _step = step; }
}
inline void update_state(float lr, float epsilon, float weight_decay)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
}
private:
float _alpha;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float* _doubled_buffer[2];
bool _buf_index;
cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
AVX_Data weight_decay4;
if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, grads + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }
simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_add<span>(grad_4, grad_4, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adagrad_Optimizer {
public:
Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
: _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
{
hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adagrad_Optimizer()
{
hipHostFree(_doubled_buffer[0]);
hipHostFree(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step)
{
_step++;
if (_step != step) { _step = step; }
}
inline void update_state(float lr, float epsilon, float weight_decay)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
}
private:
float _alpha;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float* _doubled_buffer[2];
bool _buf_index;
hipStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
AVX_Data weight_decay4;
if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, grads + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }
simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_add<span>(grad_4, grad_4, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
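// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// Scalar reference, assuming the same buffer layout, for what one element of the vectorized
// Adagrad Step_AVX loop above computes. As in the SIMD path, weight decay enters the
// accumulated second moment while the numerator uses the raw gradient, and the update
// direction comes from step_size = -alpha.
#include <cmath>
#include <cstddef>
inline void adagrad_step_scalar_reference(float* params,
                                          const float* grads,
                                          float* exp_avg_sq,
                                          size_t param_size,
                                          float alpha,
                                          float eps,
                                          float weight_decay)
{
    for (size_t i = 0; i < param_size; ++i) {
        float grad = grads[i];
        float decayed = (weight_decay > 0 ? grad + params[i] * weight_decay : grad);
        exp_avg_sq[i] += decayed * decayed;  // running sum of squared gradients
        params[i] += -alpha * (grad / (std::sqrt(exp_avg_sq[i]) + eps));  // Adagrad update
    }
}
// ------------------------------------------------------------------------------------------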
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adam_Optimizer {
public:
Adam_Optimizer(float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true)
: _alpha(alpha),
_betta1(betta1),
_betta2(betta2),
_eps(eps),
_weight_decay(weight_decay),
_betta1_t(1.0),
_betta2_t(1.0),
_step(0),
_buf_index(false),
_adamw_mode(adamw_mode)
{
cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adam_Optimizer()
{
cudaFreeHost(_doubled_buffer[0]);
cudaFreeHost(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step, float beta1, float beta2)
{
if (beta1 != _betta1 || beta2 != _betta2) {
_step = step;
_betta1 = beta1;
_betta2 = beta2;
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
} else {
_step++;
if (_step != step) {
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
_step = step;
} else {
_betta1_t *= _betta1;
_betta2_t *= _betta2;
}
}
}
inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
_bias_correction1 = 1.0f;
_bias_correction2 = 1.0f;
if (bias_correction == 1) {
_bias_correction1 = 1 - _betta1_t;
_bias_correction2 = 1 / sqrt(1 - _betta2_t);
}
}
private:
float _alpha;
float _betta1;
float _betta2;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float _bias_correction1;
float _bias_correction2;
float* _doubled_buffer[2];
bool _buf_index;
bool _adamw_mode;
cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
AVX_Data betta2_4;
betta2_4.data = SIMD_SET(_betta2);
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
AVX_Data betta1_minus1_4;
betta1_minus1_4.data = SIMD_SET(betta1_minus1);
AVX_Data betta2_minus1_4;
betta2_minus1_4.data = SIMD_SET(betta2_minus1);
AVX_Data bias2_sqrt;
bias2_sqrt.data = SIMD_SET(_bias_correction2);
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha / _bias_correction1;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
float w_decay = -1 * _alpha * _weight_decay;
AVX_Data weight_decay4;
if (_weight_decay > 0)
weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
}
simd_mul<span>(momentum_4, momentum_4, betta1_4);
simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
simd_mul<span>(variance_4, variance_4, betta2_4);
simd_mul<span>(grad_4, grad_4, grad_4);
simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
if (_weight_decay > 0 && _adamw_mode) {
simd_fma<span>(param_4, param_4, weight_decay4, param_4);
}
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adam_Optimizer {
public:
Adam_Optimizer(float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true)
: _alpha(alpha),
_betta1(betta1),
_betta2(betta2),
_eps(eps),
_weight_decay(weight_decay),
_betta1_t(1.0),
_betta2_t(1.0),
_step(0),
_buf_index(false),
_adamw_mode(adamw_mode)
{
hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adam_Optimizer()
{
hipHostFree(_doubled_buffer[0]);
hipHostFree(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
}
inline void IncrementStep(size_t step, float beta1, float beta2)
{
if (beta1 != _betta1 || beta2 != _betta2) {
_step = step;
_betta1 = beta1;
_betta2 = beta2;
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
} else {
_step++;
if (_step != step) {
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
_step = step;
} else {
_betta1_t *= _betta1;
_betta2_t *= _betta2;
}
}
}
inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
_bias_correction1 = 1.0f;
_bias_correction2 = 1.0f;
if (bias_correction == 1) {
_bias_correction1 = 1 - _betta1_t;
_bias_correction2 = 1 / sqrt(1 - _betta2_t);
}
}
private:
float _alpha;
float _betta1;
float _betta2;
float _eps;
float _weight_decay;
float _betta1_t;
float _betta2_t;
size_t _step;
float _bias_correction1;
float _bias_correction2;
float* _doubled_buffer[2];
bool _buf_index;
bool _adamw_mode;
hipStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
AVX_Data betta2_4;
betta2_4.data = SIMD_SET(_betta2);
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
AVX_Data betta1_minus1_4;
betta1_minus1_4.data = SIMD_SET(betta1_minus1);
AVX_Data betta2_minus1_4;
betta2_minus1_4.data = SIMD_SET(betta2_minus1);
AVX_Data bias2_sqrt;
bias2_sqrt.data = SIMD_SET(_bias_correction2);
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
float step_size = -1 * _alpha / _bias_correction1;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
float w_decay = -1 * _alpha * _weight_decay;
AVX_Data weight_decay4;
if (_weight_decay > 0)
weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
}
simd_mul<span>(momentum_4, momentum_4, betta1_4);
simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
simd_mul<span>(variance_4, variance_4, betta2_4);
simd_mul<span>(grad_4, grad_4, grad_4);
simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
simd_sqrt<span>(grad_4, variance_4);
simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
if (_weight_decay > 0 && _adamw_mode) {
simd_fma<span>(param_4, param_4, weight_decay4, param_4);
}
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
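// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// Scalar reference, under the same naming as above, for one element of the vectorized
// Adam/AdamW Step_AVX loop: bias_correction1 = 1 - betta1^t, bias_correction2 =
// 1/sqrt(1 - betta2^t), and in AdamW mode decoupled weight decay scales the parameter by
// (1 - alpha * weight_decay).
#include <cmath>
#include <cstddef>
inline void adam_step_scalar_reference(float* params,
                                       const float* grads,
                                       float* exp_avg,
                                       float* exp_avg_sq,
                                       size_t param_size,
                                       float alpha,
                                       float betta1,
                                       float betta2,
                                       float eps,
                                       float weight_decay,
                                       float bias_correction1,
                                       float bias_correction2,
                                       bool adamw_mode)
{
    float step_size = -alpha / bias_correction1;
    for (size_t i = 0; i < param_size; ++i) {
        float grad = grads[i];
        if (weight_decay > 0 && !adamw_mode) grad += params[i] * weight_decay;  // L2-style decay
        exp_avg[i] = betta1 * exp_avg[i] + (1 - betta1) * grad;                 // first moment
        exp_avg_sq[i] = betta2 * exp_avg_sq[i] + (1 - betta2) * grad * grad;    // second moment
        float denom = std::sqrt(exp_avg_sq[i]) * bias_correction2 + eps;
        float update = exp_avg[i] / denom;
        if (weight_decay > 0 && adamw_mode) params[i] += -alpha * weight_decay * params[i];
        params[i] += step_size * update;
    }
}
// ------------------------------------------------------------------------------------------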
@@ -5,7 +5,9 @@
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
int cublas_gemm_ex(cublasHandle_t handle,
@@ -19,7 +21,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
@@ -32,7 +38,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
@@ -49,7 +59,11 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
@@ -66,4 +80,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
@@ -5,12 +5,29 @@
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
#define CUDA_CHECK(callstr) \
{ \
cudaError_t error_code = callstr; \
if (error_code != cudaSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define MAX_THREADS 1024
#define THREADS 256
@@ -24,6 +41,34 @@
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
template <typename T>
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
@@ -255,3 +300,4 @@ void launch_fuse_transpose_bias_kernel(const T* inp,
cudaStream_t stream);
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream);
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>
#include "context_hip.h"
#include "cublas_wrappers_hip.h"
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence length supported, based on the number of threads (2048) allowed in each block;
// this MAX is 8K. For higher sequence lengths we need a higher Max, e.g. for 64K: 32
#define MAX_THREAD_ITERATIONS 8 // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
template <typename T>
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
hipStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
const T* bias,
T* output,
int intermediate_size,
int batch_size,
hipStream_t stream);
template <typename T>
void launch_gelu(const T* input,
T* output,
int intermediate_size,
int batch_size,
hipStream_t stream);
template <typename T>
void launch_d_gelu(T* d_output,
const T* input,
const T* bias,
int intermediate_size,
int batch_size,
hipStream_t stream);
// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
T* vars,
T* means);
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
hipStream_t stream,
bool preLayerNorm,
bool training,
T* vars);
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2]);
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2]);
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad,
const T* vals,
const T* out_grad_trans,
const T* vals_trans,
const T* means,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
hipStream_t stream[2]);
template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, hipStream_t stream);
template <typename T>
void launch_attn_softmax_backward(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
hipStream_t stream);
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
hipStream_t stream);
// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals,
const T* attn_mask,
int batch_size,
int heads,
int sequence_length,
hipStream_t stream);
template <typename T>
void launch_transform_0213(T* output,
const T* vals,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream);
// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
const T* vals,
const T* bias,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
hipStream_t stream,
int trans_count);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
hipStream_t stream,
int trans_count);
template <typename T>
void launch_dropout(T* vals,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
hipStream_t stream);
template <typename T>
void launch_dropout(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
int dim,
float ratio,
hipStream_t stream,
bool bwd = false);
template <typename T>
void launch_dropout(T* out,
const T* vals,
const T* residual,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
hipStream_t stream);
template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream);
template <typename T>
void launch_dropout_grad(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
float ratio,
hipStream_t stream);
template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp,
T* out,
int rows,
int cols,
hipStream_t stream);
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream);
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
struct Config {
float ratio;
uint32_t dim;
bool training;
Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
inline void SetDim(uint32_t d) { dim = d; }
};
Dropout(const Config& config) : _config(config), _mask(nullptr) {}
virtual ~Dropout() {}
void Forward(int bsz, T* out, const T* vals, cudaStream_t stream, bool bwd = false)
{
launch_dropout<T>(
out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
}
void ForwardWithBias(int bsz, T* vals, const T* bias, cudaStream_t stream)
{
launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void ForwardWithBias(int bsz,
T* out,
const T* vals,
const T* residual,
const T* bias,
cudaStream_t stream)
{
launch_dropout<T>(
out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals, cudaStream_t stream)
{
launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals_out, const T* d_vals, cudaStream_t stream)
{
launch_dropout_grad<T>(
d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
bool HasDropout() const { return _config.RATIO() > 0.0; }
void SetTrainingMode(bool training) { _config.training = training; }
void SetMask(uint8_t* mask)
{
if (!mask) { throw std::runtime_error("Dropout mask is null."); }
_mask = mask;
}
Config GetConfig() const { return _config; }
inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }
private:
uint8_t* _mask;
Config _config;
};
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
struct Config {
float ratio;
uint32_t dim;
bool training;
Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}
float RATIO() const { return training ? ratio : 0.0; }
inline void SetDim(uint32_t d) { dim = d; }
};
Dropout(const Config& config) : _config(config), _mask(nullptr) {}
virtual ~Dropout() {}
void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false)
{
launch_dropout<T>(
out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
}
void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream)
{
launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void ForwardWithBias(int bsz,
T* out,
const T* vals,
const T* residual,
const T* bias,
hipStream_t stream)
{
launch_dropout<T>(
out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals, hipStream_t stream)
{
launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream)
{
launch_dropout_grad<T>(
d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
}
bool HasDropout() const { return _config.RATIO() > 0.0; }
void SetTrainingMode(bool training) { _config.training = training; }
void SetMask(uint8_t* mask)
{
if (!mask) { throw std::runtime_error("Dropout mask is null."); }
_mask = mask;
}
Config GetConfig() const { return _config; }
inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }
private:
uint8_t* _mask;
Config _config;
};
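// --- Editor's illustrative sketch (not part of the original commit) -----------------------
// A minimal sketch, assuming `acts_in`, `acts_out`, `grads` and a bsz*dim byte `mask` buffer
// already live on the device, of the forward/backward pairing of the Dropout wrapper above.
inline void example_dropout_usage(int bsz,
                                  uint32_t dim,
                                  const float* acts_in,
                                  float* acts_out,
                                  float* grads,
                                  uint8_t* mask,
                                  hipStream_t stream)
{
    Dropout<float> drop(Dropout<float>::Config(0.1f, dim));  // 10% dropout ratio
    drop.SetMask(mask);
    drop.SetTrainingMode(true);
    drop.Forward(bsz, acts_out, acts_in, stream);  // records the mask and scales kept activations
    drop.Backward(bsz, grads, stream);             // reuses the recorded mask in-place on grads
}
// ------------------------------------------------------------------------------------------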
@@ -34,12 +34,12 @@ struct BertGemmAlgos {
template <typename T>
class BertTransformerLayer {
public:
BertTransformerLayer(unsigned layer_id,
unsigned batch_size,
unsigned hidden_size,
unsigned num_heads,
unsigned intermediate_size,
unsigned seq_length,
float attn_dropout_ratio,
float hidden_output_dropout_ratio,
float layer_norm_eps,
@@ -52,7 +52,7 @@ public:
virtual ~BertTransformerLayer();
void Forward(unsigned bsz,
const T* input_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
@@ -80,7 +80,7 @@ public:
T* gelu_inp_ptr,
T* ff2_inp_ptr);
void Backward(unsigned bsz,
const T* grad_output_ptr,
const T* input_ptr,
const T* output_ptr,
@@ -128,13 +128,13 @@ public:
T* attn_layer_norm_var,
T* attn_layer_norm_mean);
inline unsigned GetBatchSize() const { return _batch_size; }
inline unsigned GetNumHeads() const { return _heads; }
inline unsigned GetSeqLength() const { return _seq_length; }
inline unsigned GetIntermediateSize() const { return _intermediate_size; }
void SetSeqLength(unsigned seq_len);
inline unsigned GetHiddenSize() const { return _hidden_size; }
void SetTrainingMode(bool training);
inline bool IsTrainingMode() const { return _training; }
inline bool GeluCheckpoint() const { return _gelu_checkpoint; }
@@ -144,13 +144,13 @@ private:
size_t getWorkspaceSize(int maxBatchSize) const;
// Params
unsigned _layer_id;
unsigned _batch_size;
unsigned _hidden_size;
unsigned _heads;
unsigned _size_per_head;
unsigned _intermediate_size;
unsigned _seq_length;
bool _pre_or_postLayerNorm;
@@ -179,6 +179,6 @@ private:
bool _normalize_invertible;
bool _gelu_checkpoint;
// High Performance flags
bool _stochastic_mode;
};
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime_api.h>
#include <hiprand/hiprand.h>
#include <memory>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "dropout_hip.h"
#include "feed_forward_hip.h"
#include "gelu_hip.h"
#include "general_kernels_hip.h"
#include "normalize_layer_hip.h"
#include "softmax_hip.h"
#include "strided_batch_gemm_hip.h"
struct BertGemmAlgos {
int m_gemm_qkv_algo;
int m_gemm_inter_algo;
int m_gemm_output_algo;
int m_gemm_batch1_algo;
int m_gemm_batch2_algo;
BertGemmAlgos()
: m_gemm_qkv_algo(-1),
m_gemm_inter_algo(-1),
m_gemm_output_algo(-1),
m_gemm_batch1_algo(-1),
m_gemm_batch2_algo(-1)
{
}
};
template <typename T>
class BertTransformerLayer {
public:
BertTransformerLayer(unsigned layer_id,
unsigned batch_size,
unsigned hidden_size,
unsigned num_heads,
unsigned intermediate_size,
unsigned seq_length,
float attn_dropout_ratio,
float hidden_output_dropout_ratio,
float layer_norm_eps,
bool pre_or_postLayerNorm,
const std::vector<std::array<int, 3>>& gemm_algos,
bool attn_dropout_checkpoint,
bool normalize_invertible,
bool gelu_checkpoint,
bool stochastic_mode);
virtual ~BertTransformerLayer();
void Forward(unsigned bsz,
const T* input_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
const T* attn_qkvb_ptr,
const T* attn_ow_ptr,
const T* attn_ob_ptr,
const T* attn_nw_ptr,
const T* attn_nb_ptr,
const T* inter_w_ptr,
const T* inter_b_ptr,
const T* output_w_ptr,
const T* output_b_ptr,
const T* norm_w_ptr,
const T* norm_b_ptr,
T* out_ptr,
T* inp_norm_ptr,
T* q_tf_ptr,
T* k_tf_ptr,
T* v_tf_ptr,
T* softmax_output_ptr,
T* ctx_bufB_ptr,
T* attn_o_inp_ptr,
T* add_res_ptr,
T* ff1_inp_ptr,
T* gelu_inp_ptr,
T* ff2_inp_ptr);
void Backward(unsigned bsz,
const T* grad_output_ptr,
const T* input_ptr,
const T* output_ptr,
const T* inp_norm_ptr,
const T* q_tf_ptr,
const T* k_tf_ptr,
const T* v_tf_ptr,
const T* softmax_output_ptr,
const T* ctx_bufB_ptr,
const T* attn_o_inp_ptr,
const T* add_res_ptr,
const T* ff1_inp_ptr,
const T* gelu_inp_ptr,
const T* ff2_inp_ptr,
const T* input_mask_ptr,
const T* attn_qkvw_ptr,
const T* attn_ow_ptr,
const T* attn_nw_ptr,
const T* attn_nb_ptr,
const T* inter_w_ptr,
const T* inter_b_ptr,
const T* output_w_ptr,
const T* norm_w_ptr,
const T* norm_b_ptr,
T* grad_input_ptr,
T* grad_attn_qkvw_ptr,
T* grad_attn_qkvb_ptr,
T* grad_attn_ow_ptr,
T* grad_attn_ob_ptr,
T* grad_attn_nw_ptr,
T* grad_attn_nb_ptr,
T* grad_inter_w_ptr,
T* grad_inter_b_ptr,
T* grad_output_w_ptr,
T* grad_output_b_ptr,
T* grad_norm_w_ptr,
T* grad_norm_b_ptr);
void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
uint8_t* attn_output_dropout_mask_ptr,
uint8_t* layer_output_dropout_mask_ptr,
T* layer_norm_var,
T* layer_norm_mean,
T* attn_layer_norm_var,
T* attn_layer_norm_mean);
inline unsigned GetBatchSize() const { return _batch_size; }
inline unsigned GetNumHeads() const { return _heads; }
inline unsigned GetSeqLength() const { return _seq_length; }
inline unsigned GetIntermediateSize() const { return _intermediate_size; }
void SetSeqLength(unsigned seq_len);
inline unsigned GetHiddenSize() const { return _hidden_size; }
void SetTrainingMode(bool training);
inline bool IsTrainingMode() const { return _training; }
inline bool GeluCheckpoint() const { return _gelu_checkpoint; }
private:
void Initialize();
size_t getWorkspaceSize(int maxBatchSize) const;
// Params
unsigned _layer_id;
unsigned _batch_size;
unsigned _hidden_size;
unsigned _heads;
unsigned _size_per_head;
unsigned _intermediate_size;
unsigned _seq_length;
bool _pre_or_postLayerNorm;
rocblas_handle _cublasHandle;
hipStream_t _stream;
// layers
FeedForward<T> _qkv_linear;
FeedForward<T> _attn_out_linear;
Normalize_Layer<T> _attn_layer_norm;
Normalize_Layer<T> _layer_norm;
Normalize_Layer<T>* _last_normalize;
FeedForward<T> _ff1, _ff2;
Softmax<T> _softmax;
Gelu<T> _gelu;
Dropout<T> _attn_prob_dropout;
Dropout<T> _attn_output_dropout;
Dropout<T> _layer_output_dropout;
StridedBatchGemm<T> _attn_scores;
StridedBatchGemm<T> _attn_context;
bool _training;
// Memory saving flags
bool _attn_dropout_checkpoint;
bool _normalize_invertible;
bool _gelu_checkpoint;
// High Performance flags
bool _stochastic_mode;
};
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class FeedForward {
public:
struct Config {
int batchSize, outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
: batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
{
}
};
FeedForward(Config config) : config_(config) {}
~FeedForward() {}
void Forward(int bsz,
const T* input_ptr,
const T* weights,
T* out,
cublasHandle_t& _cublasHandle)
{
float alpha = T(1.);
float beta = T(0.);
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_T,
CUBLAS_OP_N,
config_.outputSize,
bsz,
config_.inputSize,
&alpha,
&beta,
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
}
void Backward(int bsz,
const T* out_grad,
const T* input_ptr,
const T* weights,
T* weights_grad,
T* bias_grad,
cublasHandle_t& _cublasHandle,
cudaStream_t& stream,
T* inp_grad_out = nullptr,
T* out_grad_trans_out = nullptr)
{
float alpha = (T)1.0, beta = (T)0.0;
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_T,
config_.inputSize,
config_.outputSize,
bsz,
&alpha,
&beta,
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif
cublas_gemm_ex(_cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
config_.inputSize,
bsz,
config_.outputSize,
&alpha,
&beta,
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
private:
Config config_;
};
#endif
// !!! This is a file automatically generated by hipify!!!
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
template <typename T>
class FeedForward {
public:
struct Config {
int batchSize, outputSize;
int inputSize;
std::array<int, 3> gemm_algos;
Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
: batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
{
}
};
FeedForward(Config config) : config_(config) {}
~FeedForward() {}
void Forward(int bsz,
const T* input_ptr,
const T* weights,
T* out,
rocblas_handle& _cublasHandle)
{
float alpha = T(1.);
float beta = T(0.);
cublas_gemm_ex(_cublasHandle,
rocblas_operation_transpose,
rocblas_operation_none,
config_.outputSize,
bsz,
config_.inputSize,
&alpha,
&beta,
weights,
input_ptr,
out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[0]));
#else
cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
}
void Backward(int bsz,
const T* out_grad,
const T* input_ptr,
const T* weights,
T* weights_grad,
T* bias_grad,
rocblas_handle& _cublasHandle,
hipStream_t& stream,
T* inp_grad_out = nullptr,
T* out_grad_trans_out = nullptr)
{
float alpha = (T)1.0, beta = (T)0.0;
cublas_gemm_ex(_cublasHandle,
rocblas_operation_none,
rocblas_operation_transpose,
config_.inputSize,
config_.outputSize,
bsz,
&alpha,
&beta,
input_ptr,
out_grad,
weights_grad,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[1]));
#else
cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif
cublas_gemm_ex(_cublasHandle,
rocblas_operation_none,
rocblas_operation_none,
config_.inputSize,
bsz,
config_.outputSize,
&alpha,
&beta,
weights,
out_grad,
inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(config_.gemm_algos[2]));
#else
cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
}
private:
Config config_;
};
#endif
#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"

template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    void ForwardWithBiasAdd(int bsz,
                            const T* input_buf,
                            const T* bias,
                            T* output,
                            cudaStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
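// --- Editor's illustration (not part of the committed header) ---
// Hedged sketch of the Gelu wrapper above: ForwardWithBiasAdd launches a fused
// bias-add + GELU kernel over bsz rows of intermediate_size elements on the
// given stream. Buffer names, the intermediate size, and the helper name are
// hypothetical placeholders.
inline void example_gelu_fp32(const float* d_ff1_out, // [bsz x inter_size], device memory
                              const float* d_bias,    // [inter_size], device memory
                              float* d_act_out,       // [bsz x inter_size], device memory
                              int bsz,
                              cudaStream_t stream)
{
    Gelu<float>::Config cfg(/*intermediate_size*/ 4096);
    Gelu<float> gelu(cfg);
    gelu.ForwardWithBiasAdd(bsz, d_ff1_out, d_bias, d_act_out, stream);
}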
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
template <typename T>
class Gelu {
public:
struct Config {
uint32_t intermediate_size;
Config(uint32_t inter_size) : intermediate_size(inter_size) {}
};
Gelu(const Config& config) : _config(config) {}
virtual ~Gelu() {}
void ForwardWithBiasAdd(int bsz,
const T* input_buf,
const T* bias,
T* output,
hipStream_t stream)
{
launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
}
void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream)
{
launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
}
private:
Config _config;
};
#pragma once

#include <cuda_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers.h"

template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        std::cout << (std::string("CUDA runtime error: ") + +file + ":" + std::to_string(line) +
                      " \n");
    }
}

#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)

template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, cublasOperation_t ta, cublasOperation_t tb, cublasHandle_t h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_T,
                           CUBLAS_OP_N,
                           N,
                           M,
                           K,
                           &alpha,
                           &beta,
                           B,
                           A,
                           C,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_N,
                           CUBLAS_OP_T,
                           K,
                           N,
                           M,
                           &alpha,
                           &beta,
                           A,
                           C,
                           B,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_N,
                           CUBLAS_OP_N,
                           K,
                           M,
                           N,
                           &alpha,
                           &beta,
                           B,
                           C,
                           A,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            cudaDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            cudaDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;
};

template <typename T>
class StridedGemmTest {
public:
    StridedGemmTest(int b,
                    int m,
                    int n,
                    int k,
                    cublasOperation_t ta,
                    cublasOperation_t tb,
                    cublasHandle_t h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;

            cublas_strided_batched_gemm(handle,
                                        M,
                                        N,
                                        K,
                                        &alpha,
                                        &beta,
                                        A,
                                        B,
                                        C,
                                        transa,
                                        transb,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            int mb = (transa == CUBLAS_OP_T ? K : M);
            int kb = (transa == CUBLAS_OP_T ? M : K);

            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;

            // B need to transpose.
            cublasOperation_t op_b = (transb == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);

            // Calculate d_A.
            cublas_strided_batched_gemm(handle,
                                        mb,
                                        kb,
                                        N,
                                        &alpha,
                                        &beta,
                                        (transa == CUBLAS_OP_T ? B : C),
                                        (transa == CUBLAS_OP_T ? C : B),
                                        A,
                                        CUBLAS_OP_N,
                                        op_b,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A need to transpose.
            cublasOperation_t op_a = (transa == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);

            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;

            // Calculate d_B.
            cublas_strided_batched_gemm(handle,
                                        K,
                                        N,
                                        M,
                                        &alpha,
                                        &beta,
                                        A,
                                        C,
                                        B,
                                        op_a,
                                        CUBLAS_OP_N,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            cudaDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            cudaDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int bsz, M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;
};
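// --- Editor's illustration (not part of the committed header) ---
// Hedged CUDA-side sketch: GemmTest<T>::TestAlgo times every candidate GEMM
// algorithm for the forward shape and the two backward shapes and returns the
// three fastest indices. Those indices are what the transformer layers store in
// Config::gemm_algos (Context::TestGemmFP16 earlier in this commit drives the
// same class). The problem sizes and the helper name here are hypothetical.
inline std::array<int, 3> example_tune_qkv_gemm(cublasHandle_t handle)
{
    const int batch_tokens = 8 * 128;  // batch_size * seq_len (example values)
    const int hidden = 1024;           // head_num * size_per_head (example value)
    GemmTest<__half> test(batch_tokens, hidden, hidden, CUBLAS_OP_T, CUBLAS_OP_N, handle);
    return test.TestAlgo(/*loops*/ 100);
}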
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers_hip.h"
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
if (result) {
std::cout << (std::string("CUDA runtime error: ") + +file + ":" + std::to_string(line) +
" \n");
}
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
class GemmTest {
public:
GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
: M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
}
~GemmTest()
{
check_cuda_error(hipFree(A));
check_cuda_error(hipFree(B));
check_cuda_error(hipFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = (T)1.0f;
float beta = (T)0.0f;
int algo_fw = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
rocblas_operation_transpose,
rocblas_operation_none,
N,
M,
K,
&alpha,
&beta,
B,
A,
C,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
rocblas_operation_none,
rocblas_operation_transpose,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
cublas_gemm_ex(handle,
rocblas_operation_none,
rocblas_operation_none,
K,
M,
N,
&alpha,
&beta,
B,
C,
A,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
hipDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
hipDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int M, N, K;
rocblas_handle handle;
rocblas_operation transa, transb;
T *A, *B, *C;
};
template <typename T>
class StridedGemmTest {
public:
StridedGemmTest(int b,
int m,
int n,
int k,
rocblas_operation ta,
rocblas_operation tb,
rocblas_handle h)
: bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
{
check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
}
~StridedGemmTest()
{
check_cuda_error(hipFree(A));
check_cuda_error(hipFree(B));
check_cuda_error(hipFree(C));
}
std::array<int, 3> TestAlgo(int loops)
{
float alpha = (T)1.0f;
float beta = (T)0.0f;
int algo_fw = Run(loops, [=](int algo) {
int stride_a = M * K;
int stride_b = N * K;
int stride_c = M * N;
cublas_strided_batched_gemm(handle,
M,
N,
K,
&alpha,
&beta,
A,
B,
C,
transa,
transb,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw1 = Run(loops, [=](int algo) {
int mb = (transa == rocblas_operation_transpose ? K : M);
int kb = (transa == rocblas_operation_transpose ? M : K);
int stride_a = mb * N;
int stride_b = N * kb;
int stride_c = M * K;
// B need to transpose.
rocblas_operation op_b = (transb == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose);
// Calculate d_A.
cublas_strided_batched_gemm(handle,
mb,
kb,
N,
&alpha,
&beta,
(transa == rocblas_operation_transpose ? B : C),
(transa == rocblas_operation_transpose ? C : B),
A,
rocblas_operation_none,
op_b,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
int algo_bw2 = Run(loops, [=](int algo) {
// A need to transpose.
rocblas_operation op_a = (transa == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose);
int stride_a = M * K;
int stride_b = M * N;
int stride_c = N * K;
// Calculate d_B.
cublas_strided_batched_gemm(handle,
K,
N,
M,
&alpha,
&beta,
A,
C,
B,
op_a,
rocblas_operation_none,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
static_cast<rocblas_gemm_algo>(algo));
#else
static_cast<cublasGemmAlgo_t>(algo));
#endif
});
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
}
template <typename Func>
int Run(int loops, Func f)
{
float fast_latency = (std::numeric_limits<float>::max)();
int fast_algo = 0;
#ifdef __HIP_PLATFORM_HCC__
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
algo++) {
int warm_up = 5;
for (int i = 0; i < warm_up; ++i) f(algo);
hipDeviceSynchronize();
Stopwatch timer;
timer.Restart();
for (int i = 0; i < loops; ++i) f(algo);
hipDeviceSynchronize();
timer.Stop();
float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
printf("algo-%d: %.3fms\n", algo, avg_latency);
if (avg_latency < fast_latency) {
fast_latency = avg_latency;
fast_algo = algo;
}
}
printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
return fast_algo;
}
private:
int bsz, M, N, K;
rocblas_handle handle;
rocblas_operation transa, transb;
T *A, *B, *C;
};
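// --- Editor's note (not part of the committed header) ---
// On the ROCm path the Run() loop above iterates only rocblas_gemm_algo_standard,
// so TestAlgo() effectively times the default algorithm rather than choosing
// among the 16 tensor-op algorithms swept on CUDA. A hedged HIP-side usage
// sketch, with hypothetical sizes and helper name:
inline std::array<int, 3> example_tune_strided_gemm_hip(rocblas_handle handle)
{
    StridedGemmTest<__half> test(/*bsz*/ 16,
                                 /*m*/ 128,
                                 /*n*/ 128,
                                 /*k*/ 64,
                                 rocblas_operation_transpose,
                                 rocblas_operation_none,
                                 handle);
    return test.TestAlgo(/*loops*/ 100);
}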