OpenDAS / deepspeed / Commits

Commit 4acf0e01
Authored Apr 26, 2023 by aiss
    delete hip file

parent 7dd68788
Changes: 83
Showing 20 changed files with 0 additions and 2864 deletions (+0 -2864)

csrc/adagrad/cpu_adagrad_hip.cpp       +0  -228
csrc/adam/cpu_adam_hip.cpp             +0  -293
csrc/adam/custom_hip_kernel.hip        +0  -22
csrc/adam/multi_tensor_adam.hip        +0  -164
csrc/adam/multi_tensor_apply_hip.cuh   +0  -129
csrc/common/custom_hip_kernel.hip      +0  -41
csrc/includes/Timer_hip.h              +0  -48
csrc/includes/context_hip.h            +0  -172
csrc/includes/cpu_adagrad_hip.h        +0  -151
csrc/includes/cpu_adam_hip.h           +0  -226
csrc/includes/cublas_wrappers_hip.h    +0  -88
csrc/includes/custom_hip_layers.h      +0  -304
csrc/includes/dropout_hip.h            +0  -77
csrc/includes/ds_transformer_hip.h     +0  -185
csrc/includes/feed_forward_hip.h       +0  -106
csrc/includes/gelu_hip.h               +0  -37
csrc/includes/gemm_test_hip.h          +0  -328
csrc/includes/general_kernels_hip.h    +0  -52
csrc/includes/normalize_layer_hip.h    +0  -203
csrc/includes/quantizer_hip.h          +0  -10
csrc/adagrad/cpu_adagrad_hip.cpp  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adagrad_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;

// C++ interface

void Adagrad_Optimizer::Step_1(float* _params, float* grads, float* _exp_avg_sq,
                               size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<1>(&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size) {
        float step_size = -1 * _alpha;
        __half* grads_cast_h;
        __half* params_cast_h;
        if (half_precision) {
            grads_cast_h = reinterpret_cast<__half*>(grads);
            params_cast_h = reinterpret_cast<__half*>(_params);
        }
        for (size_t t = rounded_size; t < _param_size; t += TILE) {
            size_t copy_size = TILE;
            if ((t + TILE) > _param_size) copy_size = _param_size - t;
            size_t offset = copy_size + t;
            if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
            for (size_t k = t; k < offset; k++) {
                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
                float param = half_precision ? (float)params_cast_h[k] : _params[k];
                float momentum = grads[k];
                float variance = _exp_avg_sq[k];
                if (_weight_decay > 0) { grad = param * _weight_decay + grad; }

                variance += grad * grad;

                grad = sqrt(variance);
                grad += _eps;
                grad = momentum / grad;
                param = grad * step_size + param;
                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
                if (half_precision)
                    params_cast_h[k] = (__half)param;
                else
                    _params[k] = param;
                // STORE UPDATE TERM TO GRAD'S MEMORY
                grads[k] = grad * step_size;
                _exp_avg_sq[k] = variance;
            }
            if (dev_params) {
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
                _buf_index = !_buf_index;
            }
        }
    }
}

void Adagrad_Optimizer::Step_4(float* _params, float* grads, float* _exp_avg_sq,
                               size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<4>(&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_1((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int create_adagrad_optimizer(int optimizer_id,
                             float alpha = 1e-2,
                             float eps = 1e-8,
                             float weight_decay = 0,
                             bool should_log = false)
{
    auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);

    s_optimizers[optimizer_id] = opt;

    if (should_log) {
        std::string avx_type = "";
#if defined(__AVX512__)
        avx_type = "AVX512";
#else
#if defined(__AVX256__)
        avx_type = "AVX2";
#else
        avx_type = "scalar";
#endif
#endif
        printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
               optimizer_id,
               avx_type.c_str());
        printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
    }

    return 0;
}

void Adagrad_Optimizer::Step_8(float* _params, float* grads, float* _exp_avg_sq,
                               size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<8>(&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_4((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int ds_adagrad_step(int optimizer_id,
                    size_t step,
                    float lr,
                    float epsilon,
                    float weight_decay,
                    torch::Tensor& params,
                    torch::Tensor& grads,
                    torch::Tensor& exp_avg_sq)
{
    auto params_c = params.contiguous();
    auto grads_c = grads.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adagrad_Optimizer> opt =
        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step);
    opt->update_state(lr, epsilon, weight_decay);
    opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));

    opt->SynchronizeStreams();
    return 0;
}

int ds_adagrad_step_plus_copy(int optimizer_id,
                              size_t step,
                              float lr,
                              float epsilon,
                              float weight_decay,
                              torch::Tensor& params,
                              torch::Tensor& grads,
                              torch::Tensor& exp_avg_sq,
                              torch::Tensor& gpu_params)
{
    auto params_c = params.contiguous();
    auto gpu_params_c = gpu_params.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();
    auto grads_c = grads.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adagrad_Optimizer> opt =
        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step);
    opt->update_state(lr, epsilon, weight_decay);
    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_sq_ptr,
                params_c.size(0),
                gpu_params_ptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
    return 0;
}

int destroy_adagrad_optimizer(int optimizer_id)
{
    s_optimizers.erase(optimizer_id);

    return 0;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
    m.def("adagrad_update_copy",
          &ds_adagrad_step_plus_copy,
          "DeepSpeed CPU Adagrad update and param copy (C++)");
    m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
    m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
}
csrc/adam/cpu_adam_hip.cpp  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adam_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;

// C++ interface

void Adam_Optimizer::Step_1(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                            size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<1>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size) {
        float betta1_minus1 = 1 - _betta1;
        float betta2_minus1 = 1 - _betta2;

        float step_size = -1 * _alpha / _bias_correction1;
        float w_decay = -1 * _alpha * _weight_decay;
        __half* grads_cast_h;
        __half* params_cast_h;
        if (half_precision) {
            grads_cast_h = reinterpret_cast<__half*>(grads);
            params_cast_h = reinterpret_cast<__half*>(_params);
        }

        for (size_t t = rounded_size; t < _param_size; t += TILE) {
            size_t copy_size = TILE;
            if ((t + TILE) > _param_size) copy_size = _param_size - t;
            size_t offset = copy_size + t;
            if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
            for (size_t k = t; k < offset; k++) {
                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
                float param = half_precision ? (float)params_cast_h[k] : _params[k];
                float momentum = _exp_avg[k];
                float variance = _exp_avg_sq[k];
                if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
                momentum = momentum * _betta1;
                momentum = grad * betta1_minus1 + momentum;

                variance = variance * _betta2;
                grad = grad * grad;
                variance = grad * betta2_minus1 + variance;

                grad = sqrt(variance);
                grad = grad * _bias_correction2 + _eps;
                grad = momentum / grad;
                if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
                param = grad * step_size + param;
                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;

                if (half_precision)
                    params_cast_h[k] = (__half)param;
                else
                    _params[k] = param;
                _exp_avg[k] = momentum;
                _exp_avg_sq[k] = variance;
            }
            if (dev_params) {
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
                _buf_index = !_buf_index;
            }
        }
    }
}

void Adam_Optimizer::Step_4(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                            size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<4>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_1((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int create_adam_optimizer(int optimizer_id,
                          float alpha = 1e-3,
                          float betta1 = 0.9,
                          float betta2 = 0.999,
                          float eps = 1e-8,
                          float weight_decay = 0,
                          bool adamw_mode = true,
                          bool should_log = false)
{
    auto opt = std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);

    s_optimizers[optimizer_id] = opt;

    if (should_log) {
        std::string avx_type = "";
#if defined(__AVX512__)
        avx_type = "AVX512";
#else
#if defined(__AVX256__)
        avx_type = "AVX2";
#else
        avx_type = "scalar";
#endif
#endif
        printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
               optimizer_id,
               avx_type.c_str());
        printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
               alpha,
               betta1,
               betta2,
               weight_decay,
               (int)adamw_mode);
    }

    return 0;
}

void Adam_Optimizer::Step_8(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                            size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<8>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_4((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int ds_adam_step(int optimizer_id,
                 size_t step,
                 float lr,
                 float beta1,
                 float beta2,
                 float epsilon,
                 float weight_decay,
                 bool bias_correction,
                 torch::Tensor& params,
                 torch::Tensor& grads,
                 torch::Tensor& exp_avg,
                 torch::Tensor& exp_avg_sq)
{
    auto params_c = params.contiguous();
    auto grads_c = grads.contiguous();
    auto exp_avg_c = exp_avg.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();

    // assert(params.options().dtype() == grads.options().dtype());

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adam_Optimizer> opt =
        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step, beta1, beta2);
    opt->update_state(lr, epsilon, weight_decay, bias_correction);

    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_ptr,
                exp_avg_sq_ptr,
                params_c.size(0),
                nullptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
    return 0;
}

int ds_adam_step_plus_copy(int optimizer_id,
                           size_t step,
                           float lr,
                           float beta1,
                           float beta2,
                           float epsilon,
                           float weight_decay,
                           bool bias_correction,
                           torch::Tensor& params,
                           torch::Tensor& grads,
                           torch::Tensor& exp_avg,
                           torch::Tensor& exp_avg_sq,
                           torch::Tensor& gpu_params)
{
    auto params_c = params.contiguous();
    auto gpu_params_c = gpu_params.contiguous();
    auto exp_avg_c = exp_avg.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();
    auto grads_c = grads.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adam_Optimizer> opt =
        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step, beta1, beta2);
    opt->update_state(lr, epsilon, weight_decay, bias_correction);
    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_ptr,
                exp_avg_sq_ptr,
                params_c.size(0),
                gpu_params_ptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
    return 0;
}

int destroy_adam_optimizer(int optimizer_id)
{
    s_optimizers.erase(optimizer_id);

    return 0;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
    m.def("adam_update_copy",
          &ds_adam_step_plus_copy,
          "DeepSpeed CPU Adam update and param copy (C++)");
    m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
    m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
}
csrc/adam/custom_hip_kernel.hip  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
csrc/adam/multi_tensor_adam.hip  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "multi_tensor_apply_hip.cuh"
#include "type_shim_hip.h"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum {
ADAM_MODE_0 = 0, // L2 regularization mode
ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW)
} adamMode_t;
using MATH_T = float;
template <typename T>
struct AdamFunctor {
__device__ __forceinline__ void operator()(int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<4>& tl,
const float beta1,
const float beta2,
const float beta1_correction,
const float beta2_correction,
const float epsilon,
const float lr,
adamMode_t mode,
const float decay)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
// potentially use to pass in list of scalar
// int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx * chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx * chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx * chunk_size;
T* v = (T*)tl.addresses[3][tensor_loc];
v += chunk_idx * chunk_size;
n -= chunk_idx * chunk_size;
// see note in multi_tensor_scale_kernel.cu
for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
int i = i_start + threadIdx.x + ii * blockDim.x;
if (i < n && i < chunk_size) {
r_g[ii] = g[i];
r_p[ii] = p[i];
r_m[ii] = m[i];
r_v[ii] = v[i];
} else {
r_g[ii] = MATH_T(0);
r_p[ii] = MATH_T(0);
r_m[ii] = MATH_T(0);
r_v[ii] = MATH_T(0);
}
}
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
if (mode == ADAM_MODE_0) { // L2
r_g[ii] = r_g[ii] + (decay * r_p[ii]);
r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = next_m_unbiased / denom;
r_p[ii] = r_p[ii] - (lr * update);
} else { // weight decay
r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
r_p[ii] = r_p[ii] - (lr * update);
}
}
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
int i = i_start + threadIdx.x + ii * blockDim.x;
if (i < n && i < chunk_size) {
p[i] = r_p[ii];
m[i] = r_m[ii];
v[i] = r_v[ii];
}
}
}
}
};
void multi_tensor_adam_cuda(int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int mode,
const int bias_correction,
const float weight_decay)
{
using namespace at;
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - ::pow(beta1, step);
bias_correction2 = 1 - ::pow(beta2, step);
}
// Assume single type across p,g,m1,m2 now
DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(),
0,
"adam",
multi_tensor_apply<4>(BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<scalar_t_0>(),
beta1,
beta2,
bias_correction1,
bias_correction2,
epsilon,
lr,
(adamMode_t)mode,
weight_decay);)
AT_CUDA_CHECK(hipGetLastError());
}
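The AdamFunctor above branches on adamMode_t: ADAM_MODE_0 folds the weight decay into the gradient (classic L2 regularization), while ADAM_MODE_1 adds it to the update term (decoupled weight decay, i.e. AdamW). The scalar sketch below illustrates only that difference; it is not code from the deleted file, and the helper name adam_scalar_step and the sample values are assumptions.

// Minimal scalar sketch (illustration only, not part of the deleted file) of the
// two adamMode_t branches used by AdamFunctor above.
#include <cmath>
#include <cstdio>

struct AdamState { float m = 0.f, v = 0.f; };

// One element-wise step, mirroring the per-element body of AdamFunctor.
float adam_scalar_step(float p, float g, AdamState& s, float lr, float beta1, float beta2,
                       float eps, float decay, int step, bool adamw /* ADAM_MODE_1 */)
{
    if (!adamw) g += decay * p;                       // L2 mode: decay enters the gradient
    s.m = beta1 * s.m + (1 - beta1) * g;
    s.v = beta2 * s.v + (1 - beta2) * g * g;
    float m_hat = s.m / (1 - std::pow(beta1, step));  // bias corrections
    float v_hat = s.v / (1 - std::pow(beta2, step));
    float update = m_hat / (std::sqrt(v_hat) + eps);
    if (adamw) update += decay * p;                   // AdamW mode: decay enters the update
    return p - lr * update;
}

int main()
{
    AdamState s;
    float p = 1.0f;
    for (int step = 1; step <= 3; ++step)
        p = adam_scalar_step(p, 0.1f, s, 1e-3f, 0.9f, 0.999f, 1e-8f, 0.01f, step, true);
    std::printf("param after 3 AdamW steps: %f\n", p);
    return 0;
}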
csrc/adam/multi_tensor_apply_hip.cuh  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include "compat.h"
#include <assert.h>
// #include <iostream>
// This header is the one-stop shop for all your multi-tensor apply needs.
// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};

template <int n>
struct TensorListMetadata {
    void* addresses[n][depth_to_max_tensors[n - 1]];
    int sizes[depth_to_max_tensors[n - 1]];
    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
    int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a full int.
    int start_tensor_this_launch;
};

template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(int chunk_size,
                                          volatile int* noop_flag,
                                          T tl,
                                          U callable,
                                          ArgTypes... args)
{
    // Hand the chunk information to the user-supplied functor to process however it likes.
    callable(chunk_size, noop_flag, tl, args...);
}

template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(int block_size,
                        int chunk_size,
                        const at::Tensor& noop_flag,
                        const std::vector<std::vector<at::Tensor>>& tensor_lists,
                        T callable,
                        ArgTypes... args)
{
    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
    int len0 = tensor_lists[0].size();
    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
    auto ref_device = tensor_lists[0][0].device();
    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
    for (int l = 0; l < tensor_lists.size(); l++)  // No range-based for because I need indices
    {
        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
        for (int t = 0; t < tensor_lists[l].size(); t++) {
            // TODO: Print which tensor fails.
            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
            contiguous_memory =
                (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
            TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
                        "A tensor was not on the same device as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
        }
    }

    int ntensors = tensor_lists[0].size();

    TensorListMetadata<depth> tl;

    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
    auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();

    tl.start_tensor_this_launch = 0;
    int loc_block_info = 0;
    int loc_tensor_info = 0;
    for (int t = 0; t < ntensors; t++) {
        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
        for (int d = 0; d < depth; d++)
            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
        loc_tensor_info++;

        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

        for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
            // std::cout << chunks_this_tensor << std::endl;
            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
            tl.block_to_chunk[loc_block_info] = chunk;
            loc_block_info++;

            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                                 chunk == chunks_this_tensor - 1);
            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
            if (tensors_full || blocks_full || last_chunk) {
                // using accscalar_t = acc_type<scalar_t, true>;
                hipLaunchKernelGGL((multi_tensor_apply_kernel),
                                   dim3(loc_block_info),
                                   dim3(block_size),
                                   0,
                                   stream,
                                   chunk_size,
                                   noop_flag.DATA_PTR<int>(),
                                   tl,
                                   callable,
                                   args...);

                AT_CUDA_CHECK(hipGetLastError());

                // Reset.  The control flow possibilities here make my brain hurt.
                loc_block_info = 0;
                if (chunk == chunks_this_tensor - 1) {
                    // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
                    loc_tensor_info = 0;
                    tl.start_tensor_this_launch = t + 1;
                } else {
                    // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
                    for (int d = 0; d < depth; d++)
                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
                    loc_tensor_info = 1;
                    tl.start_tensor_this_launch = t;
                }
            }
        }
    }
}
csrc/common/custom_hip_kernel.hip  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
__half2* output_cast = reinterpret_cast<__half2*>(output);
if (id < size) {
float input_f = input[id];
__half2* input_h = reinterpret_cast<__half2*>(&input_f);
output_cast[id] = *input_h;
}
}
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
size /= 2;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel_half), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
csrc/includes/Timer_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#ifndef __TIMER_H__
#define __TIMER_H__
#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"
class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }
    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }
    inline void Record() { hipEventRecord(start); }
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() / 1e3);
    }
};
#endif
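A brief usage sketch of the two timers declared above; it is not taken from the repository. The kernel busy_kernel, the buffer size, and the grid configuration are assumptions made for illustration, and the runtime calls (hipMalloc, hipLaunchKernelGGL, hipFree) are standard HIP entry points.

// Hedged usage sketch (illustration only): timing a kernel with GPUTimer and
// comparing against CPUTimer wall-clock time. Assumes a HIP device is present.
#include <hip/hip_runtime.h>
#include <cstdio>
#include "Timer_hip.h"

__global__ void busy_kernel(float* x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] = x[i] * 2.0f + 1.0f;
}

int main()
{
    const int n = 1 << 20;
    float* d_x = nullptr;
    hipMalloc(&d_x, n * sizeof(float));

    GPUTimer gpu_timer;
    CPUTimer cpu_timer;

    gpu_timer.Record();                       // records the start event
    hipLaunchKernelGGL(busy_kernel, dim3((n + 255) / 256), dim3(256), 0, 0, d_x, n);
    float gpu_ms = 0.f;
    gpu_timer.Elapsed(gpu_ms);                // records stop, synchronizes, returns ms

    float cpu_ms = cpu_timer.Elapsed();       // wall-clock ms since construction
    std::printf("kernel: %.3f ms (GPU events), %.3f ms (CPU clock)\n", gpu_ms, cpu_ms);

    hipFree(d_x);
    return 0;
}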
csrc/includes/context_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "gemm_test_hip.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}

class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
        hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        rocblas_destroy_handle(_cublasHandle);
        hipFree(_workspace);
    }

    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    hiprandGenerator_t& GetRandGenerator() { return _gen; }

    hipStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
        return stream;
    }

    hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }

    rocblas_handle GetCublasHandle() { return _cublasHandle; }

    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            rocblas_handle handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            rocblas_operation_transpose,
                                            rocblas_operation_none,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            rocblas_operation_none,
                                            rocblas_operation_none,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    hiprandGenerator_t _gen;
    rocblas_handle _cublasHandle;
    void* _workspace;
    uint64_t _seed;
    uint64_t _curr_offset;
    std::vector<std::array<int, 3>> _gemm_algos;
};
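A hedged sketch of how code in this extension typically consumes the Context singleton above (shared rocBLAS handle, current HIP stream, RNG seed/offset pairs, and the block-count helper). The function name and the particular way the values are combined here are illustrative assumptions, not code from the deleted header.

// Illustration only, assuming context_hip.h is on the include path.
#include "context_hip.h"

void example_use_of_context(int elements)
{
    // One process-wide context: rocBLAS handle plus PyTorch's current HIP stream.
    rocblas_handle blas = Context::Instance().GetCublasHandle();
    hipStream_t stream = Context::Instance().GetCurrentStream();

    // Grid sizing helper defined in this header (capped at DS_MAXIMUM_NUM_BLOCKS).
    int blocks = DS_GET_BLOCKS(elements);

    // (seed, offset) pair that randomized kernels can use; the offset advances by
    // the number of random values this launch is expected to consume.
    std::pair<uint64_t, uint64_t> seed_and_offset = Context::Instance().IncrementOffset(elements);

    (void)blas; (void)stream; (void)blocks; (void)seed_and_offset;
}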
csrc/includes/cpu_adagrad_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adagrad_Optimizer {
public:
    Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
        : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
    {
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adagrad_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step)
    {
        _step++;
        if (_step != step) { _step = step; }
    }
    inline void update_state(float lr, float epsilon, float weight_decay)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
    }

private:
    float _alpha;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float* _doubled_buffer[2];
    bool _buf_index;
    hipStream_t _streams[2];
};

#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
                                 float* _params,
                                 float* grads,
                                 float* _exp_avg_sq,
                                 size_t _param_size,
                                 __half* dev_params,
                                 bool half_precision)
{
    size_t new_rounded_size = 0;
    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    float step_size = -1 * _alpha;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    AVX_Data weight_decay4;
    if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, grads + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }

            simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_add<span>(grad_4, grad_4, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);
            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
csrc/includes/cpu_adam_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adam_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else {
            _step++;
            if (_step != step) {
                _betta1_t = std::pow(_betta1, step);
                _betta2_t = std::pow(_betta2, step);
                _step = step;
            } else {
                _betta1_t *= _betta1;
                _betta2_t *= _betta2;
            }
        }
    }
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;

        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
    float _alpha;
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float _bias_correction1;
    float _bias_correction2;

    float* _doubled_buffer[2];
    bool _buf_index;
    bool _adamw_mode;

    hipStream_t _streams[2];
};

#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
                              float* _params,
                              float* grads,
                              float* _exp_avg,
                              float* _exp_avg_sq,
                              size_t _param_size,
                              __half* dev_params,
                              bool half_precision)
{
    size_t new_rounded_size = 0;

    AVX_Data betta1_4;
    betta1_4.data = SIMD_SET(_betta1);
    AVX_Data betta2_4;
    betta2_4.data = SIMD_SET(_betta2);

    float betta1_minus1 = 1 - _betta1;
    float betta2_minus1 = 1 - _betta2;
    AVX_Data betta1_minus1_4;
    betta1_minus1_4.data = SIMD_SET(betta1_minus1);
    AVX_Data betta2_minus1_4;
    betta2_minus1_4.data = SIMD_SET(betta2_minus1);

    AVX_Data bias2_sqrt;
    bias2_sqrt.data = SIMD_SET(_bias_correction2);

    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    float step_size = -1 * _alpha / _bias_correction1;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    float w_decay = -1 * _alpha * _weight_decay;
    AVX_Data weight_decay4;
    if (_weight_decay > 0)
        weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, _exp_avg + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0 && !_adamw_mode) {
                simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
            }

            simd_mul<span>(momentum_4, momentum_4, betta1_4);
            simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
            simd_mul<span>(variance_4, variance_4, betta2_4);
            simd_mul<span>(grad_4, grad_4, grad_4);
            simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);

            if (_weight_decay > 0 && _adamw_mode) {
                simd_fma<span>(param_4, param_4, weight_decay4, param_4);
            }

            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg + i, momentum_4, false);
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
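One detail worth spelling out: update_state() precomputes _bias_correction1 = 1 - beta1^t and _bias_correction2 = 1 / sqrt(1 - beta2^t), and Step_1/Step_AVX then compute m / (sqrt(v) * _bias_correction2 + eps) scaled by -alpha / _bias_correction1. Because sqrt(v / (1 - beta2^t)) equals sqrt(v) / sqrt(1 - beta2^t), this is algebraically the textbook bias-corrected Adam step. The self-contained check below is an illustration only, not part of the deleted header; the sample values are made up.

// Illustration: the rearranged update used above matches the textbook form.
#include <cmath>
#include <cstdio>

int main()
{
    const float alpha = 1e-3f, beta1 = 0.9f, beta2 = 0.999f, eps = 1e-8f;
    const int step = 7;
    float m = 0.02f, v = 0.0005f, param = 1.0f, grad_in = 0.1f;

    m = beta1 * m + (1 - beta1) * grad_in;
    v = beta2 * v + (1 - beta2) * grad_in * grad_in;

    // Textbook form: divide the moments by their corrections.
    float m_hat = m / (1 - std::pow(beta1, step));
    float v_hat = v / (1 - std::pow(beta2, step));
    float p_ref = param - alpha * m_hat / (std::sqrt(v_hat) + eps);

    // Rearranged form used by the deleted optimizer code.
    float bc1 = 1 - std::pow(beta1, step);            // _bias_correction1
    float bc2 = 1 / std::sqrt(1 - std::pow(beta2, step));  // _bias_correction2
    float denom = std::sqrt(v) * bc2 + eps;
    float p_ds = param + (-alpha / bc1) * (m / denom);

    std::printf("textbook: %.9f  rearranged: %.9f\n", p_ref, p_ds);
    return 0;
}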
csrc/includes/cublas_wrappers_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa, rocblas_operation transb,
                   int m, int n, int k,
                   const float* alpha, const float* beta,
                   const float* A, const float* B, float* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa, rocblas_operation transb,
                   int m, int n, int k,
                   const float* alpha, const float* beta,
                   const __half* A, const __half* B, __half* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m, int n, int k,
                                const float* alpha, const float* beta,
                                const float* A, const float* B, float* C,
                                rocblas_operation op_A, rocblas_operation op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m, int n, int k,
                                const float* alpha, const float* beta,
                                const __half* A, const __half* B, __half* C,
                                rocblas_operation op_A, rocblas_operation op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
csrc/includes/custom_hip_layers.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>
#include "context_hip.h"
#include "cublas_wrappers_hip.h"
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8 // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
template <typename T>
void launch_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);

// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input, const T* bias, T* output,
                      int intermediate_size, int batch_size, hipStream_t stream);

template <typename T>
void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, hipStream_t stream);

template <typename T>
void launch_d_gelu(T* d_output, const T* input, const T* bias,
                   int intermediate_size, int batch_size, hipStream_t stream);

// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual, const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim, hipStream_t stream,
                                     bool preLayerNorm, bool training, T* vars, T* means);

template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual, const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim, hipStream_t stream,
                                     bool preLayerNorm, bool training, T* vars);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2, const T* X_data,
                                         const T* vars, const T* means, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim, hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2, const T* vals_hat,
                                         const T* vars, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim, hipStream_t stream[2],
                                         bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward(const T* out_grad, const T* X_data, const T* vars, const T* means,
                               const T* gamma, T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim, hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward(const T* out_grad, const T* vals_hat, const T* vars, const T* gamma,
                               T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim, hipStream_t stream[2],
                               bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad, const T* vals,
                                           const T* out_grad_trans, const T* vals_trans,
                                           const T* means, const T* vars, const T* gamma,
                                           T* gamma_grad, T* betta_grad, T* inp_grad,
                                           int batch_size, int hidden_dim, hipStream_t stream[2]);

template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward(T* out_grad, const T* soft_inp,
                                  int batch_size, int heads, int seq_length, hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad, const T* soft_inp,
                                     int batch_size, int heads, int seq_length, hipStream_t stream);

// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals, const T* attn_mask,
                         int batch_size, int heads, int sequence_length, hipStream_t stream);

template <typename T>
void launch_transform_0213(T* output, const T* vals, int batch_size, int seq_length,
                           int hidden_dim, int heads, hipStream_t stream);

// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs, const T* vals, const T* bias, int batch_size,
                                    int seq_length, int hidden_dim, int heads,
                                    hipStream_t stream, int trans_count);

// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out, const T* in, int batch_size, int heads, int seq_length,
                             int hidden_dim, hipStream_t stream, int trans_count);

template <typename T>
void launch_dropout(T* vals, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout(T* vals_out, const T* vals, uint8_t* mask, int total_count, int dim,
                    float ratio, hipStream_t stream, bool bwd = false);

template <typename T>
void launch_dropout(T* out, const T* vals, const T* residual, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals_out, const T* vals, uint8_t* mask,
                         int total_count, float ratio, hipStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp, T* out, int rows, int cols, hipStream_t stream);

void launch_param_update(const float* input, __half* output, int size, hipStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream);
csrc/includes/dropout_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
    struct Config {
        float ratio;
        uint32_t dim;
        bool training;

        Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}

        float RATIO() const { return training ? ratio : 0.0; }
        inline void SetDim(uint32_t d) { dim = d; }
    };

    Dropout(const Config& config) : _config(config), _mask(nullptr) {}

    virtual ~Dropout() {}

    void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false)
    {
        launch_dropout<T>(
            out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
    }

    void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream)
    {
        launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void ForwardWithBias(int bsz, T* out, const T* vals, const T* residual, const T* bias, hipStream_t stream)
    {
        launch_dropout<T>(
            out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(
            d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    bool HasDropout() const { return _config.RATIO() > 0.0; }

    void SetTrainingMode(bool training) { _config.training = training; }

    void SetMask(uint8_t* mask)
    {
        if (!mask) { throw std::runtime_error("Dropout mask is null."); }
        _mask = mask;
    }

    Config GetConfig() const { return _config; }

    inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

private:
    uint8_t* _mask;
    Config _config;
};
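A hedged usage sketch for the Dropout<T> wrapper above: the mask buffer is owned by the caller and handed in through SetMask(), and RATIO() collapses to zero outside training mode. The function name, buffer names, and the 0.1 ratio below are assumptions made for illustration, not code from the repository.

// Illustration only, assuming dropout_hip.h and its launch_dropout declarations are available.
#include <hip/hip_runtime.h>
#include "dropout_hip.h"

void attention_output_dropout_example(float* activations, const float* bias,
                                      uint8_t* mask_buffer, int batch_tokens,
                                      int hidden_dim, hipStream_t stream, bool training)
{
    Dropout<float> dropout(Dropout<float>::Config(0.1f, hidden_dim));  // ratio, dim
    dropout.SetTrainingMode(training);   // RATIO() becomes 0.0 in eval mode
    dropout.SetMask(mask_buffer);        // mask memory is owned by the caller

    // Fused bias + dropout over batch_tokens rows of hidden_dim elements.
    dropout.ForwardWithBias(batch_tokens, activations, bias, stream);
}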
csrc/includes/ds_transformer_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_runtime_api.h>
#include <hiprand/hiprand.h>
#include <memory>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"

#include "dropout_hip.h"
#include "feed_forward_hip.h"
#include "gelu_hip.h"
#include "general_kernels_hip.h"
#include "normalize_layer_hip.h"
#include "softmax_hip.h"
#include "strided_batch_gemm_hip.h"

struct BertGemmAlgos {
    int m_gemm_qkv_algo;
    int m_gemm_inter_algo;
    int m_gemm_output_algo;
    int m_gemm_batch1_algo;
    int m_gemm_batch2_algo;

    BertGemmAlgos()
        : m_gemm_qkv_algo(-1),
          m_gemm_inter_algo(-1),
          m_gemm_output_algo(-1),
          m_gemm_batch1_algo(-1),
          m_gemm_batch2_algo(-1)
    {
    }
};

template <typename T>
class BertTransformerLayer {
public:
    BertTransformerLayer(unsigned layer_id,
                         unsigned batch_size,
                         unsigned hidden_size,
                         unsigned num_heads,
                         unsigned intermediate_size,
                         unsigned seq_length,
                         float attn_dropout_ratio,
                         float hidden_output_dropout_ratio,
                         float layer_norm_eps,
                         bool pre_or_postLayerNorm,
                         const std::vector<std::array<int, 3>>& gemm_algos,
                         bool attn_dropout_checkpoint,
                         bool normalize_invertible,
                         bool gelu_checkpoint,
                         bool stochastic_mode);

    virtual ~BertTransformerLayer();

    void Forward(unsigned bsz,
                 const T* input_ptr,
                 const T* input_mask_ptr,
                 const T* attn_qkvw_ptr,
                 const T* attn_qkvb_ptr,
                 const T* attn_ow_ptr,
                 const T* attn_ob_ptr,
                 const T* attn_nw_ptr,
                 const T* attn_nb_ptr,
                 const T* inter_w_ptr,
                 const T* inter_b_ptr,
                 const T* output_w_ptr,
                 const T* output_b_ptr,
                 const T* norm_w_ptr,
                 const T* norm_b_ptr,
                 T* out_ptr,
                 T* inp_norm_ptr,
                 T* q_tf_ptr,
                 T* k_tf_ptr,
                 T* v_tf_ptr,
                 T* softmax_output_ptr,
                 T* ctx_bufB_ptr,
                 T* attn_o_inp_ptr,
                 T* add_res_ptr,
                 T* ff1_inp_ptr,
                 T* gelu_inp_ptr,
                 T* ff2_inp_ptr);

    void Backward(unsigned bsz,
                  const T* grad_output_ptr,
                  const T* input_ptr,
                  const T* output_ptr,
                  const T* inp_norm_ptr,
                  const T* q_tf_ptr,
                  const T* k_tf_ptr,
                  const T* v_tf_ptr,
                  const T* softmax_output_ptr,
                  const T* ctx_bufB_ptr,
                  const T* attn_o_inp_ptr,
                  const T* add_res_ptr,
                  const T* ff1_inp_ptr,
                  const T* gelu_inp_ptr,
                  const T* ff2_inp_ptr,
                  const T* input_mask_ptr,
                  const T* attn_qkvw_ptr,
                  const T* attn_ow_ptr,
                  const T* attn_nw_ptr,
                  const T* attn_nb_ptr,
                  const T* inter_w_ptr,
                  const T* inter_b_ptr,
                  const T* output_w_ptr,
                  const T* norm_w_ptr,
                  const T* norm_b_ptr,
                  T* grad_input_ptr,
                  T* grad_attn_qkvw_ptr,
                  T* grad_attn_qkvb_ptr,
                  T* grad_attn_ow_ptr,
                  T* grad_attn_ob_ptr,
                  T* grad_attn_nw_ptr,
                  T* grad_attn_nb_ptr,
                  T* grad_inter_w_ptr,
                  T* grad_inter_b_ptr,
                  T* grad_output_w_ptr,
                  T* grad_output_b_ptr,
                  T* grad_norm_w_ptr,
                  T* grad_norm_b_ptr);

    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
                                uint8_t* attn_output_dropout_mask_ptr,
                                uint8_t* layer_output_dropout_mask_ptr,
                                T* layer_norm_var,
                                T* layer_norm_mean,
                                T* attn_layer_norm_var,
                                T* attn_layer_norm_mean);

    inline unsigned GetBatchSize() const { return _batch_size; }
    inline unsigned GetNumHeads() const { return _heads; }
    inline unsigned GetSeqLength() const { return _seq_length; }
    inline unsigned GetIntermediateSize() const { return _intermediate_size; }

    void SetSeqLength(unsigned seq_len);
    inline unsigned GetHiddenSize() const { return _hidden_size; }
    void SetTrainingMode(bool training);
    inline bool IsTrainingMode() const { return _training; }
    inline bool GeluCheckpoint() const { return _gelu_checkpoint; }

private:
    void Initialize();
    size_t getWorkspaceSize(int maxBatchSize) const;

    // Params
    unsigned _layer_id;
    unsigned _batch_size;
    unsigned _hidden_size;
    unsigned _heads;
    unsigned _size_per_head;
    unsigned _intermediate_size;
    unsigned _seq_length;

    bool _pre_or_postLayerNorm;

    rocblas_handle _cublasHandle;
    hipStream_t _stream;

    // layers
    FeedForward<T> _qkv_linear;
    FeedForward<T> _attn_out_linear;
    Normalize_Layer<T> _attn_layer_norm;
    Normalize_Layer<T> _layer_norm;
    Normalize_Layer<T>* _last_normalize;
    FeedForward<T> _ff1, _ff2;
    Softmax<T> _softmax;
    Gelu<T> _gelu;
    Dropout<T> _attn_prob_dropout;
    Dropout<T> _attn_output_dropout;
    Dropout<T> _layer_output_dropout;
    StridedBatchGemm<T> _attn_scores;
    StridedBatchGemm<T> _attn_context;

    bool _training;

    // Memory saving flags
    bool _attn_dropout_checkpoint;
    bool _normalize_invertible;
    bool _gelu_checkpoint;

    // High Performance flags
    bool _stochastic_mode;
};
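A minimal host-side construction sketch (illustrative only, not from the deleted file). The parameter values are placeholders; buffer wiring happens later through Forward/Backward and SetIntermediateBuffers.

// Sketch: one transformer layer with default (-1) GEMM algo indices.
std::array<int, 3> default_algos = {-1, -1, -1};
std::vector<std::array<int, 3>> gemm_algos(5, default_algos);
BertTransformerLayer<float> layer(/*layer_id=*/0,
                                  /*batch_size=*/8,
                                  /*hidden_size=*/1024,
                                  /*num_heads=*/16,
                                  /*intermediate_size=*/4096,
                                  /*seq_length=*/128,
                                  /*attn_dropout_ratio=*/0.1f,
                                  /*hidden_output_dropout_ratio=*/0.1f,
                                  /*layer_norm_eps=*/1e-12f,
                                  /*pre_or_postLayerNorm=*/true,
                                  gemm_algos,
                                  /*attn_dropout_checkpoint=*/false,
                                  /*normalize_invertible=*/false,
                                  /*gelu_checkpoint=*/false,
                                  /*stochastic_mode=*/false);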
csrc/includes/feed_forward_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"

template <typename T>
class FeedForward {
public:
    struct Config {
        int batchSize, outputSize;
        int inputSize;
        std::array<int, 3> gemm_algos;
        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
        {
        }
    };

    FeedForward(Config config) : config_(config) {}

    ~FeedForward() {}

    void Forward(int bsz, const T* input_ptr, const T* weights, T* out, rocblas_handle& _cublasHandle)
    {
        float alpha = T(1.);
        float beta = T(0.);

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_transpose,
                       rocblas_operation_none,
                       config_.outputSize,
                       bsz,
                       config_.inputSize,
                       &alpha,
                       &beta,
                       weights,
                       input_ptr,
                       out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[0]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
    }

    void Backward(int bsz,
                  const T* out_grad,
                  const T* input_ptr,
                  const T* weights,
                  T* weights_grad,
                  T* bias_grad,
                  rocblas_handle& _cublasHandle,
                  hipStream_t& stream,
                  T* inp_grad_out = nullptr,
                  T* out_grad_trans_out = nullptr)
    {
        float alpha = (T)1.0, beta = (T)0.0;

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none,
                       rocblas_operation_transpose,
                       config_.inputSize,
                       config_.outputSize,
                       bsz,
                       &alpha,
                       &beta,
                       input_ptr,
                       out_grad,
                       weights_grad,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[1]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none,
                       rocblas_operation_none,
                       config_.inputSize,
                       bsz,
                       config_.outputSize,
                       &alpha,
                       &beta,
                       weights,
                       out_grad,
                       inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[2]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
    }

private:
    Config config_;
};

#endif
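A minimal usage sketch of FeedForward (illustrative only). It assumes d_in, d_w, and d_out are device buffers of the appropriate sizes and handle is an initialized rocblas_handle; the -1 algo indices mirror the defaults used by BertGemmAlgos above.

// Sketch: y = W^T x over a batch of 8 tokens, hidden 1024 -> 1024.
std::array<int, 3> algos = {-1, -1, -1};
FeedForward<float>::Config cfg(/*batch=*/8, /*outputs=*/1024, /*inputs=*/1024, algos);
FeedForward<float> ff(cfg);
ff.Forward(/*bsz=*/8, d_in, d_w, d_out, handle);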
csrc/includes/gelu_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"

template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    void ForwardWithBiasAdd(int bsz, const T* input_buf, const T* bias, T* output, hipStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
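A short usage sketch (illustrative only). The pointers d_in, d_bias, and d_out are assumed to be pre-allocated device buffers, n_tokens a placeholder row count, and stream an existing hipStream_t.

// Sketch: fused bias-add + GELU over the intermediate activation.
Gelu<float> gelu(Gelu<float>::Config(/*inter_size=*/4096));
gelu.ForwardWithBiasAdd(/*bsz=*/n_tokens, d_in, d_bias, d_out, stream);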
csrc/includes/gemm_test_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers_hip.h"

template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        std::cout << (std::string("CUDA runtime error: ") + file + ":" + std::to_string(line) +
                      "\n");
    }
}

#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)

template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_transpose,
                           rocblas_operation_none,
                           N, M, K,
                           &alpha, &beta,
                           B, A, C,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none,
                           rocblas_operation_transpose,
                           K, N, M,
                           &alpha, &beta,
                           A, C, B,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none,
                           rocblas_operation_none,
                           K, M, N,
                           &alpha, &beta,
                           B, C, A,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};
template <typename T>
class StridedGemmTest {
public:
    StridedGemmTest(int b,
                    int m,
                    int n,
                    int k,
                    rocblas_operation ta,
                    rocblas_operation tb,
                    rocblas_handle h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;

            cublas_strided_batched_gemm(handle,
                                        M, N, K,
                                        &alpha, &beta,
                                        A, B, C,
                                        transa, transb,
                                        stride_a, stride_b, stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            int mb = (transa == rocblas_operation_transpose ? K : M);
            int kb = (transa == rocblas_operation_transpose ? M : K);

            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;

            // B need to transpose.
            rocblas_operation op_b = (transb == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            // Calculate d_A.
            cublas_strided_batched_gemm(handle,
                                        mb, kb, N,
                                        &alpha, &beta,
                                        (transa == rocblas_operation_transpose ? B : C),
                                        (transa == rocblas_operation_transpose ? C : B),
                                        A,
                                        rocblas_operation_none,
                                        op_b,
                                        stride_a, stride_b, stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A need to transpose.
            rocblas_operation op_a = (transa == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;

            // Calculate d_B.
            cublas_strided_batched_gemm(handle,
                                        K, N, M,
                                        &alpha, &beta,
                                        A, C, B,
                                        op_a,
                                        rocblas_operation_none,
                                        stride_a, stride_b, stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int bsz, M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};
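A short usage sketch of the tuner (illustrative only). It assumes handle is an initialized rocblas_handle; the shapes are placeholders for a 1024-wide linear layer over 8 * 128 tokens, and the returned indices would feed the gemm_algos arrays used by FeedForward above.

// Sketch: time forward and both backward GEMMs, keep the fastest algo ids.
GemmTest<__half> tuner(/*m=*/1024, /*n=*/8 * 128, /*k=*/1024,
                       rocblas_operation_transpose,
                       rocblas_operation_none,
                       handle);
std::array<int, 3> best_algos = tuner.TestAlgo(/*loops=*/100);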
csrc/includes/general_kernels_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>

#include "context_hip.h"
#include "cublas_wrappers_hip.h"

#define THREADS 256
#define TILE_DIM 32

#define minus_infinity -1 * std::numeric_limits<float>::infinity()

#define FINAL_MASK 0xffffffff

template <typename T>
void launch_fused_add2(T* out,
                       const T* inp1,
                       const T* inp2,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);

template <typename T>
void launch_fused_add4(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       const T* inp4,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);

template <typename T>
void launch_fused_add3(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);
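A call-site sketch for these declarations (illustrative only). The pointers d_out, d_attn_out, and d_residual are assumed pre-allocated device buffers holding [batch, seq, hidden] tensors, and stream is an existing hipStream_t.

// Sketch: fused elementwise add of two activation tensors on the given stream.
launch_fused_add2<float>(d_out, d_attn_out, d_residual,
                         /*batch_size=*/8, /*seq_length=*/128,
                         /*hidden_size=*/1024, stream);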
csrc/includes/normalize_layer_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <fstream>
#include "custom_hip_layers.h"

using namespace std;

template <typename T>
class Normalize_Layer {
public:
    struct Config {
        uint32_t batchSize;
        uint32_t seqLength;
        uint32_t hiddenDim;
        float epsilon;
        bool training;
        bool useMean;
        Config(uint32_t batch,
               uint32_t seq,
               uint32_t h,
               float epsilon = 1e-12,
               bool training = true,
               bool useMean = true)
            : batchSize(batch),
              seqLength(seq),
              hiddenDim(h),
              epsilon(epsilon),
              training(training),
              useMean(useMean)
        {
        }
    };

    Normalize_Layer(Config config)
        : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr)
    {
    }

    ~Normalize_Layer() {}

    void ForwardCheckpoint(int bsz,  // batch * seq
                           T* vals,
                           const T* residual,
                           const T* gamma,
                           const T* betta,
                           hipStream_t& stream,
                           bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars,
                                        means);
    }

    void Forward(int bsz,
                 T* vals,
                 const T* residual,
                 const T* gamma,
                 const T* betta,
                 hipStream_t& stream,
                 bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars);
    }

    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  T* gamma_grad,
                  T* betta_grad,
                  hipStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_in = nullptr)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_in,
                                  vars,
                                  means,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream);
    }

    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  const T* betta,
                  T* gamma_grad,
                  T* betta_grad,
                  hipStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_out)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_out,
                                  vars,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream,
                                  !config_.useMean,
                                  betta);
    }

    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          T* gamma_grad,
                          T* betta_grad,
                          hipStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_in = nullptr)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_in,
                                            vars,
                                            means,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream);
    }

    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          const T* betta,
                          T* gamma_grad,
                          T* betta_grad,
                          hipStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_out)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_out,
                                            vars,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream,
                                            !config_.useMean,
                                            betta);
    }

    inline bool UseMean() const { return config_.useMean; }

    inline void SetVar(T* variance)
    {
        if (!variance) { throw std::runtime_error("Normalize variance is null."); }
        vars = variance;
    }

    inline void SetMean(T* mean)
    {
        if (!mean) { throw std::runtime_error("Normalize mean is null."); }
        means = mean;
    }

private:
    Config config_;
    T* vars;
    T* means;
    T* vals_hat;
};
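A minimal usage sketch (illustrative only). It assumes d_vals, d_residual, d_gamma, d_beta, and d_var are pre-allocated device buffers (d_var holding one value per row for the backward pass) and stream is an existing hipStream_t.

// Sketch: forward layer norm without saved mean (useMean = false).
Normalize_Layer<float>::Config cfg(/*batch=*/8, /*seq=*/128, /*h=*/1024,
                                   /*epsilon=*/1e-12, /*training=*/true,
                                   /*useMean=*/false);
Normalize_Layer<float> ln(cfg);
ln.SetVar(d_var);
ln.Forward(/*bsz=*/8 * 128, d_vals, d_residual, d_gamma, d_beta, stream);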
csrc/includes/quantizer_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <cooperative_groups.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>

#include <cassert>
#include <iostream>