Commit 4acf0e01 authored by aiss

delete hip file

parent 7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
#include <fstream>
using namespace std;
template <typename T>
class Softmax {
public:
struct Config {
size_t batchSize;
size_t heads;
size_t seq_length;
size_t prob_depth;
float temperature;
bool mem_alloc;
Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
: batchSize(batch),
heads(h),
seq_length(seq),
prob_depth(prob_size),
temperature(1.0),
mem_alloc(mem_alloc)
{
}
};
Softmax(Config config) : config_(config) {}
~Softmax() {}
void Forward(int bsz, T* vals, const T* attn_mask, hipStream_t& stream)
{
launch_attn_softmax<T>(vals, attn_mask, bsz, config_.heads, config_.seq_length, stream);
}
void Backward(int bsz, T* out_grad, const T* soft_out, hipStream_t stream)
{
launch_attn_softmax_backward_v2<T>(
out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream);
}
inline size_t GetProbDepth() const { return config_.prob_depth; }
inline size_t GetBatchSize() const { return config_.batchSize; }
inline size_t GetNumHeads() const { return config_.heads; }
inline size_t GetSeqLength() const { return config_.seq_length; }
inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; }
private:
Config config_;
};
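A minimal usage sketch of the Softmax wrapper above, assuming launch_attn_softmax is declared in custom_hip_layers.h as included by this header; the function and buffer names here are illustrative only and not part of the original file.
// Hypothetical example: apply the fused attention softmax in place to a score
// tensor of shape [batch, heads, seq_length, seq_length] already on the GPU.
void run_softmax_example(float* scores, const float* attn_mask,
                         int batch, int heads, int seq_len, hipStream_t stream)
{
    Softmax<float>::Config cfg(batch, heads, seq_len);
    Softmax<float> softmax(cfg);
    // Forward mutates `scores` in place via launch_attn_softmax<float>.
    softmax.Forward(batch, scores, attn_mask, stream);
}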
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "context_hip.h"
template <typename T>
class StridedBatchGemm {
public:
struct Config {
int batch_size;
int m;
int n;
int k;
float alpha;
float beta;
rocblas_operation op_A;
rocblas_operation op_B;
std::array<int, 3> gemm_algos;
Config(int batch,
int mm,
int nn,
int kk,
float param_alpha,
float param_beta,
rocblas_operation opA,
rocblas_operation opB,
const std::array<int, 3>& algos)
: batch_size(batch),
m(mm),
n(nn),
k(kk),
alpha(param_alpha),
beta(param_beta),
op_A(opA),
op_B(opB),
gemm_algos(algos)
{
}
void SetConfig(int mm, int nn, int kk)
{
m = mm;
n = nn;
k = kk;
}
};
StridedBatchGemm(const Config& config) : _config(config) {}
virtual ~StridedBatchGemm() {}
void Forward(int bsz, T* output, const T* _buffer_a, const T* _buffer_b, rocblas_handle handle)
{
int stride_a = _config.m * _config.k;
int stride_b = _config.n * _config.k;
int stride_c = _config.m * _config.n;
cublas_strided_batched_gemm(handle,
_config.m,
_config.n,
_config.k,
&_config.alpha,
&_config.beta,
_buffer_a,
_buffer_b,
output,
_config.op_A,
_config.op_B,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[0]));
#else
cublasGemmAlgo_t(_config.gemm_algos[0]));
#endif
}
void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, rocblas_handle handle)
{
int stride_a = _config.m * _config.k;
int stride_b = _config.n * _config.k;
int stride_c = _config.m * _config.n;
cublas_strided_batched_gemm(handle,
_config.m,
_config.n,
_config.k,
&_config.alpha,
&_config.beta,
_buffer_a,
_buffer_b,
output,
_config.op_A,
_config.op_B,
stride_a,
stride_b,
stride_c,
_config.batch_size,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[0]));
#else
cublasGemmAlgo_t(_config.gemm_algos[0]));
#endif
k_buf = _buffer_a;
q_buf = _buffer_b;
}
void Backward(int bsz,
const T* d_output,
const T* _buffer_a,
const T* _buffer_b,
rocblas_handle handle,
T* inpGradA = nullptr,
T* inpGradB = nullptr)
{
int mb = (_config.op_A == rocblas_operation_transpose ? _config.k : _config.m);
int kb = (_config.op_A == rocblas_operation_transpose ? _config.m : _config.k);
int stride_a = mb * _config.n;
int stride_b = _config.n * kb;
int stride_c = _config.m * _config.k;
// B needs to be transposed.
rocblas_operation op_b = (_config.op_B == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose);
// Calculate d_A.
cublas_strided_batched_gemm(handle,
mb,
kb,
_config.n,
&_config.alpha,
&_config.beta,
(_config.op_A == rocblas_operation_transpose ? _buffer_b : d_output),
(_config.op_A == rocblas_operation_transpose ? d_output : _buffer_b),
inpGradA,
rocblas_operation_none,
op_b,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[1]));
#else
cublasGemmAlgo_t(_config.gemm_algos[1]));
#endif
// A needs to be transposed.
rocblas_operation op_a = (_config.op_A == rocblas_operation_transpose ? rocblas_operation_none : rocblas_operation_transpose);
stride_a = _config.m * _config.k;
stride_b = _config.m * _config.n;
stride_c = _config.n * _config.k;
// Calculate d_B.
cublas_strided_batched_gemm(handle,
_config.k,
_config.n,
_config.m,
&_config.alpha,
&_config.beta,
_buffer_a,
d_output,
inpGradB,
op_a,
rocblas_operation_none,
stride_a,
stride_b,
stride_c,
bsz,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo(_config.gemm_algos[2]));
#else
cublasGemmAlgo_t(_config.gemm_algos[2]));
#endif
}
inline int GetN() const { return _config.k; }
inline const T* GetBufferA() const { return k_buf; }
inline const T* GetBufferB() const { return q_buf; }
inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
private:
Config _config;
const T* q_buf;
const T* k_buf;
};
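An illustrative configuration of the StridedBatchGemm wrapper above, hedged as a sketch: the shapes mirror a batched QK^T-style product with m = n = seq_length and k = head_dim, and the algorithm indices {0, 0, 0} simply pick the default GEMM algorithm. The function name and scaling choice are assumptions made for the example.
// Hypothetical call site: C = alpha * op_A(A) * op_B(B) for each batch entry.
void run_strided_gemm_example(const float* A, const float* B, float* C,
                              int batch, int seq_len, int head_dim,
                              rocblas_handle handle)
{
    const float alpha = 1.0f / sqrtf((float)head_dim);  // typical attention scaling
    StridedBatchGemm<float>::Config cfg(batch,
                                        seq_len,   // m
                                        seq_len,   // n
                                        head_dim,  // k
                                        alpha,
                                        0.0f,      // beta
                                        rocblas_operation_transpose,  // op_A
                                        rocblas_operation_none,       // op_B
                                        {0, 0, 0});
    StridedBatchGemm<float> gemm(cfg);
    gemm.Forward(batch, C, A, B, handle);
}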
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
#include <ATen/ATen.h>
// Forward/backward compatibility hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
// pending more future-proof guidance from upstream.
// struct TypeShim
// {
// const at::Type& payload;
// TypeShim(const at::Type& type) : payload(type) {}
// // Enable trivial conversion to a const at::Type& for pre-3aeb78
// operator const at::Type&(){ return payload; };
// // Enable dispatch switch statements to take *this directly for post-3aeb78
// //operator at::ScalarType(){ return payload.; };
// };
#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Float: { \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Double: { \
using scalar_t_##LEVEL = double; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Float: { \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: { \
using scalar_t_##LEVEL = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: { \
using scalar_t_##LEVEL = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Double: { \
using scalar_t_##LEVEL = double; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Float: { \
using scalar_t_##LEVEL = float; \
__VA_ARGS__; \
break; \
} \
default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
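A minimal sketch of how these dispatch macros are used; my_kernel_launcher is an illustrative name, not a function in this codebase. The LEVEL argument only determines the name of the scalar_t_##LEVEL alias visible inside each case body.
// Illustrative launcher declaration so the example is self-contained.
template <typename scalar_t>
void my_kernel_launcher(const at::Tensor& input);

void dispatch_example(const at::Tensor& input)
{
    // Expands to a switch over Float/Half/BFloat16; scalar_t_0 is the
    // per-case type alias because LEVEL is 0 here.
    DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "dispatch_example",
                            my_kernel_launcher<scalar_t_0>(input));
}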
template <typename T>
__device__ __forceinline__ T
reduce_block_into_lanes(T* x,
T val,
int lanes = 1,
bool share_result = false) // lanes is intended to be <= 32.
{
int tid = threadIdx.x + threadIdx.y * blockDim.x;
int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.
if (blockSize >= 64) {
x[tid] = val;
__syncthreads();
}
#pragma unroll
for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
if (tid < i) x[tid] = x[tid] + x[tid + i];
__syncthreads();
}
T final;
if (tid < 32) {
if (blockSize >= 64)
final = x[tid] + x[tid + 32];
else
final = val;
// __SYNCWARP();
#pragma unroll
for (int i = 16; i >= lanes; i >>= 1)
final = final + __shfl_down_sync(0xffffffff, final, i);
}
if (share_result) {
if (tid < lanes) x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
__syncthreads();
}
return final;
}
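A sketch of how reduce_block_into_lanes can be called, under the assumptions stated in its comments: blockDim.x * blockDim.y is a multiple of 32 and no larger than the scratch buffer, and only the first `lanes` threads (here lane 0) hold the finished result. The kernel name and buffers are illustrative.
// Hypothetical kernel: sum one float per thread into one value per block.
__global__ void block_sum_example(const float* in, float* block_sums, int n)
{
    __shared__ float scratch[1024];  // >= blockDim.x * blockDim.y elements
    int tid = threadIdx.x + threadIdx.y * blockDim.x;
    int gid = blockIdx.x * blockDim.x * blockDim.y + tid;
    float val = (gid < n) ? in[gid] : 0.0f;
    float total = reduce_block_into_lanes(scratch, val);  // lanes = 1
    if (tid == 0) block_sums[blockIdx.x] = total;
}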
// !!! This is a file automatically generated by hipify!!!
/* Copyright 2019 The Microsoft DeepSpeed Team */
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/ATen.h"
#include "ATen/TensorUtils.h"
#include "ATen/hip/HIPContext.h"
#include "ATen/hip/detail/IndexUtils.cuh"
//#include "ATen/Type.h"
#include "ATen/AccumulateType.h"
#include <iostream>
//#include <helper_functions.h>
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <hip/hip_runtime_api.h>
namespace cg = cooperative_groups;
// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
namespace {
// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
template <typename T>
struct SharedMemory {
// Ensure that we won't compile any un-specialized types
__device__ inline operator T*()
{
#ifndef _WIN32
extern __device__ void error(void);
error();
#endif
return NULL;
}
};
template <>
struct SharedMemory<float> {
__device__ inline operator float*()
{
HIP_DYNAMIC_SHARED( float, s_float)
return s_float;
}
};
template <>
struct SharedMemory<double> {
__device__ inline operator double*()
{
HIP_DYNAMIC_SHARED( double, s_double)
return s_double;
}
};
} // namespace
#include "type_shim_hip.h"
typedef enum {
ADAM_MODE_0 = 0, // eps under square root
ADAM_MODE_1 = 1 // eps outside square root
} adamMode_t;
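The two modes differ only in where eps enters the Adam denominator; the scalar helper below is a sketch added for illustration and is not part of the original file.
// Scalar sketch of the two denominator conventions selected by adamMode_t.
inline float adam_denom(float v, float eps, adamMode_t mode)
{
    // ADAM_MODE_0: eps inside the square root; ADAM_MODE_1: eps outside.
    return (mode == ADAM_MODE_0) ? sqrtf(v + eps) : (sqrtf(v) + eps);
}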
// s_a and s_b are in shared memory
// g_a and g_b are in global memory
template <typename T, int blockSize>
__device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// perform block reduction in shared memory,
unsigned int tid = cta.thread_rank();
T a_sum = s_a[tid];
T b_sum = s_b[tid];
cg::sync(cta);
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 256];
s_b[tid] = b_sum = b_sum + s_b[tid + 256];
}
cg::sync(cta);
if ((blockSize >= 256) && (tid < 128)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 128];
s_b[tid] = b_sum = b_sum + s_b[tid + 128];
}
cg::sync(cta);
if ((blockSize >= 128) && (tid < 64)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 64];
s_b[tid] = b_sum = b_sum + s_b[tid + 64];
}
cg::sync(cta);
#if (__CUDA_ARCH__ >= 300)
if (tid < 32) {
cg::coalesced_group active = cg::coalesced_threads();
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64) {
a_sum = a_sum + s_a[tid + 32];
b_sum = b_sum + s_b[tid + 32];
}
// Reduce final warp using shuffle
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
a_sum += active.shfl_down(a_sum, offset);
b_sum += active.shfl_down(b_sum, offset);
}
}
#else
if ((blockSize >= 64) && (tid < 32)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 32];
s_b[tid] = b_sum = b_sum + s_b[tid + 32];
}
cg::sync(cta);
if ((blockSize >= 32) && (tid < 16)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 16];
s_b[tid] = b_sum = b_sum + s_b[tid + 16];
}
cg::sync(cta);
if ((blockSize >= 16) && (tid < 8)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 8];
s_b[tid] = b_sum = b_sum + s_b[tid + 8];
}
cg::sync(cta);
if ((blockSize >= 8) && (tid < 4)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 4];
s_b[tid] = b_sum = b_sum + s_b[tid + 4];
}
cg::sync(cta);
if ((blockSize >= 4) && (tid < 2)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 2];
s_b[tid] = b_sum = b_sum + s_b[tid + 2];
}
cg::sync(cta);
if ((blockSize >= 2) && (tid < 1)) {
s_a[tid] = a_sum = a_sum + s_a[tid + 1];
s_b[tid] = b_sum = b_sum + s_b[tid + 1];
}
cg::sync(cta);
#endif
// write result for this block to global mem
if (tid == 0) {
g_a[blockIdx.x] = (T)a_sum;
g_b[blockIdx.x] = (T)b_sum;
}
}
template <typename T, int blockSize>
__device__ void reduce_two_vectors_in_register(T a, T b, T* g_a, T* g_b)
{
const int threadIdInBlock = cg::this_thread_block().thread_rank();
T* s_a = SharedMemory<T>();
T* s_b = SharedMemory<T>() + cg::this_thread_block().size();
s_a[threadIdInBlock] = a;
s_b[threadIdInBlock] = b;
reduce_block_in_shared_memory<T, blockSize>(s_a, s_b, g_a, g_b);
}
template <typename T, typename GRAD_T, int blockSize>
__global__ void lamb_cuda_kernel_part1(
T* __restrict__ p,
GRAD_T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed
T* __restrict__ m,
T* __restrict__ v,
const GRAD_T* __restrict__ g,
const float b1,
const float b2,
const float eps,
const float grad_scale,
const float step_size,
const size_t tsize,
adamMode_t mode,
const float decay,
T* __restrict__ w_l2_i,
T* __restrict__ u_l2_i)
{
// Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
const int threadIdInBlock = cg::this_thread_block().thread_rank();
const int i = (blockId * threadsPerBlock + threadIdInBlock);
const int totThreads = gridDim.x * gridDim.y * threadsPerBlock;
T reg_w = 0;
T reg_u = 0;
for (int j = i; j < tsize; j += totThreads) {
T scaled_grad = g[j] / grad_scale;
T pj = p[j];
m[j] = b1 * m[j] + (1 - b1) * scaled_grad;
v[j] = b2 * v[j] + (1 - b2) * scaled_grad * scaled_grad;
float denom;
if (mode == ADAM_MODE_0)
denom = sqrtf(v[j] + eps);
else // Mode 1
denom = sqrtf(v[j]) + eps;
T update = (m[j] / denom) + (decay * p[j]);
reg_u += update * update;
reg_w += pj * pj;
}
reduce_two_vectors_in_register<T, blockSize>(reg_w, reg_u, w_l2_i, u_l2_i);
}
template <typename T, typename GRAD_T, int blockSize>
__global__ void lamb_cuda_kernel_part2(const size_t tsize, T* __restrict__ g_a, T* __restrict__ g_b)
{
T* s_a = SharedMemory<T>();
T* s_b = SharedMemory<T>() + cg::this_thread_block().size();
const int threadIdInBlock = cg::this_thread_block().thread_rank();
s_a[threadIdInBlock] = g_a[threadIdInBlock];
s_b[threadIdInBlock] = g_b[threadIdInBlock];
if (threadIdInBlock >= tsize) {
s_a[threadIdInBlock] = 0.0;
s_b[threadIdInBlock] = 0.0;
}
reduce_block_in_shared_memory<T, blockSize>(s_a, s_b, g_a, g_b);
}
template <typename T, typename GRAD_T>
__global__ void lamb_cuda_kernel_part3(
T* __restrict__ p,
GRAD_T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed
T* __restrict__ m,
T* __restrict__ v,
const GRAD_T* __restrict__ g,
const float b1,
const float b2,
const float max_coeff,
const float min_coeff,
const float eps,
const float grad_scale,
const float step_size,
const size_t tsize,
adamMode_t mode,
const float decay,
T* __restrict__ w_l2_i,
T* __restrict__ u_l2_i,
T* __restrict__ lamb_coeff_val)
{
// Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
const int threadIdInBlock = cg::this_thread_block().thread_rank();
const int i = (blockId * threadsPerBlock + threadIdInBlock);
const int totThreads = gridDim.x * gridDim.y * threadsPerBlock;
T reg_w = sqrtf(w_l2_i[0]);
T reg_u = sqrtf(u_l2_i[0]);
float lamb_coeff = 1.0;
if (reg_w != 0 && reg_u != 0) {
lamb_coeff = reg_w / reg_u;
if (lamb_coeff > max_coeff) { lamb_coeff = max_coeff; }
if (lamb_coeff < min_coeff) { lamb_coeff = min_coeff; }
}
if (blockId == 0 && threadIdInBlock == 0) {
lamb_coeff_val[0] = lamb_coeff;
// printf("Cuda Lamb Coeff is %.6f \n",lamb_coeff);
}
for (int j = i; j < tsize; j += totThreads) {
T pj = (float)p[j];
T mj = m[j];
T vj = v[j];
float denom;
if (mode == ADAM_MODE_0)
denom = sqrtf(vj + eps);
else // Mode 1
denom = sqrtf(vj) + eps;
T update = (mj / denom) + (decay * pj);
pj = pj - (step_size * lamb_coeff * update);
p[j] = pj;
if (p_copy != NULL) p_copy[j] = (GRAD_T)pj;
}
}
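The three kernels above split one LAMB step: part1 accumulates per-block squared norms of the weights and of the Adam-style update, part2 finishes that reduction, and part3 clamps the trust ratio w_norm / u_norm into [min_coeff, max_coeff] and applies the scaled step. The host-side scalar reference below is only a sketch of that combined math; it omits gradient scaling, bias correction, and the fp16 parameter copy, and uses the ADAM_MODE_1 denominator.
// Illustrative single-threaded reference for the fused LAMB update.
inline void lamb_reference_step(float* p, float* m, float* v, const float* g,
                                size_t n, float b1, float b2, float eps,
                                float step_size, float decay,
                                float max_coeff, float min_coeff)
{
    float w_l2 = 0.0f, u_l2 = 0.0f;
    for (size_t j = 0; j < n; ++j) {
        m[j] = b1 * m[j] + (1 - b1) * g[j];
        v[j] = b2 * v[j] + (1 - b2) * g[j] * g[j];
        float update = m[j] / (sqrtf(v[j]) + eps) + decay * p[j];
        w_l2 += p[j] * p[j];
        u_l2 += update * update;
    }
    float lamb_coeff = 1.0f;
    if (w_l2 != 0.0f && u_l2 != 0.0f)
        lamb_coeff = fminf(fmaxf(sqrtf(w_l2) / sqrtf(u_l2), min_coeff), max_coeff);
    for (size_t j = 0; j < n; ++j) {
        float update = m[j] / (sqrtf(v[j]) + eps) + decay * p[j];
        p[j] -= step_size * lamb_coeff * update;
    }
}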
void fused_lamb_cuda(at::Tensor& p,
at::Tensor& p_copy,
at::Tensor& m,
at::Tensor& v,
at::Tensor& g,
float lr,
float beta1,
float beta2,
float max_coeff,
float min_coeff,
float eps,
float grad_scale,
int step,
int mode,
int bias_correction,
float decay,
at::Tensor& w_l2_i,
at::Tensor& u_l2_i,
at::Tensor& lamb_coeff)
{
// using namespace at;
// Get tensor size
int tsize = p.numel();
// Determine #threads and #blocks
const int threadsPerBlock = 512;
int num_blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
if (num_blocks > 512) num_blocks = 512;
int smemsize = 0;
if (p.type().scalarType() == at::ScalarType::Double)
smemsize = 2 * threadsPerBlock * sizeof(double);
else
smemsize = 2 * threadsPerBlock * sizeof(float);
const dim3 blocks(num_blocks);
const dim3 threads(threadsPerBlock);
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p),
"parameter tensor is too large to be indexed with int32");
// Constants
float step_size = 0;
if (bias_correction == 1) {
const float bias_correction1 = 1 - ::pow(beta1, step);
const float bias_correction2 = 1 - ::pow(beta2, step);
step_size = lr * std::sqrt(bias_correction2) / bias_correction1;
} else {
step_size = lr;
}
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (g.type().scalarType() == at::ScalarType::Half) {
// all other values should be fp32 for half gradients
AT_ASSERTM(p.type().scalarType() == at::ScalarType::Float,
"expected parameter to be of float type");
// dispatch is done on the gradient type
using namespace at; // prevents "toString is undefined" errors
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
g.scalar_type(), "lamb_cuda_kernel", ([&] {
using accscalar_t = at::acc_type<scalar_t, true>;
hipLaunchKernelGGL(( lamb_cuda_kernel_part1<accscalar_t, scalar_t, threadsPerBlock>)
, dim3(blocks), dim3(threadsPerBlock), smemsize, stream,
p.data<accscalar_t>(),
p_copy.numel() ? p_copy.data<scalar_t>() : NULL,
m.data<accscalar_t>(),
v.data<accscalar_t>(),
g.data<scalar_t>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
(adamMode_t)mode,
decay,
w_l2_i.data<accscalar_t>(),
u_l2_i.data<accscalar_t>());
hipLaunchKernelGGL(( lamb_cuda_kernel_part2<accscalar_t, scalar_t, threadsPerBlock>)
, dim3(1), dim3(threadsPerBlock), smemsize, stream,
num_blocks, w_l2_i.data<accscalar_t>(), u_l2_i.data<accscalar_t>());
hipLaunchKernelGGL(( lamb_cuda_kernel_part3<accscalar_t, scalar_t>)
, dim3(blocks), dim3(threadsPerBlock), smemsize, stream,
p.data<accscalar_t>(),
p_copy.numel() ? p_copy.data<scalar_t>() : NULL,
m.data<accscalar_t>(),
v.data<accscalar_t>(),
g.data<scalar_t>(),
beta1,
beta2,
max_coeff,
min_coeff,
eps,
grad_scale,
step_size,
tsize,
(adamMode_t)mode,
decay,
w_l2_i.data<accscalar_t>(),
u_l2_i.data<accscalar_t>(),
lamb_coeff.data<accscalar_t>());
}));
} else {
using namespace at;
AT_DISPATCH_FLOATING_TYPES(
g.scalar_type(), "lamb_cuda_kernel", ([&] {
hipLaunchKernelGGL(( lamb_cuda_kernel_part1<scalar_t, scalar_t, threadsPerBlock>)
, dim3(blocks), dim3(threadsPerBlock), smemsize, stream,
p.data<scalar_t>(),
NULL, // don't output p_copy for fp32, it's wasted write
m.data<scalar_t>(),
v.data<scalar_t>(),
g.data<scalar_t>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
(adamMode_t)mode,
decay,
w_l2_i.data<scalar_t>(),
u_l2_i.data<scalar_t>());
hipLaunchKernelGGL(( lamb_cuda_kernel_part2<scalar_t, scalar_t, threadsPerBlock>)
, dim3(1), dim3(threadsPerBlock), smemsize, stream,
num_blocks, w_l2_i.data<scalar_t>(), u_l2_i.data<scalar_t>());
hipLaunchKernelGGL(( lamb_cuda_kernel_part3<scalar_t, scalar_t>)
, dim3(blocks), dim3(threadsPerBlock), smemsize, stream,
p.data<scalar_t>(),
NULL, // don't output p_copy for fp32, it's wasted write
m.data<scalar_t>(),
v.data<scalar_t>(),
g.data<scalar_t>(),
beta1,
beta2,
max_coeff,
min_coeff,
eps,
grad_scale,
step_size,
tsize,
(adamMode_t)mode,
decay,
w_l2_i.data<scalar_t>(),
u_l2_i.data<scalar_t>(),
lamb_coeff.data<scalar_t>());
}));
}
C10_HIP_CHECK(hipGetLastError());
}
// template __device__ void reduce_two_vectors_in_register<float,512>(float a, float b, float* g_a,
// float* g_b, cg::grid_group &cgg);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/hip/HIPContext.h>
#include <torch/extension.h>
#include <vector>
#include "custom_hip_layers.h"
template <typename T>
at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits)
{
auto t_size = vals.sizes();
int size = 1;
for (auto dim : t_size) size *= dim;
if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
launch_quantize_kernel(
(T*)vals.data_ptr(), size, groups, bits, at::hip::getCurrentHIPStreamMasqueradingAsCUDA());
}
return vals;
}
template <typename T>
at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits)
{
auto t_size = vals.sizes();
int size = 1;
for (auto dim : t_size) size *= dim;
if (((size / groups) / 4 / 1024) <= 256) {
launch_sr_quantize_kernel(
(T*)vals.data_ptr(), size, groups, bits, at::hip::getCurrentHIPStreamMasqueradingAsCUDA());
}
return vals;
}
template <typename T>
at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits)
{
auto t_size = vals.sizes();
int size = 1;
for (auto dim : t_size) size *= dim;
if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
launch_quantize_kernel_asym(
(T*)vals.data_ptr(), size, groups, bits, at::hip::getCurrentHIPStreamMasqueradingAsCUDA());
}
return vals;
}
template <typename T>
at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits)
{
auto t_size = vals.sizes();
int size = 1;
for (auto dim : t_size) size *= dim;
if (((size / groups) / 4 / 1024) <= 256) {
launch_sr_quantize_kernel_asym(
(T*)vals.data_ptr(), size, groups, bits, at::hip::getCurrentHIPStreamMasqueradingAsCUDA());
}
return vals;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("ds_quantize_fp32", &ds_quantize<float>, "DeepSpeed Quantize with fp32 (CUDA)");
m.def("ds_quantize_fp16", &ds_quantize<__half>, "DeepSpeed Quantize with fp16 (CUDA)");
m.def("ds_sr_quantize_fp32", &ds_sr_quantize<float>, "DeepSpeed Quantize with fp32 (CUDA)");
m.def("ds_sr_quantize_fp16", &ds_sr_quantize<__half>, "DeepSpeed Quantize with fp16 (CUDA)");
m.def("ds_quantize_asym_fp32", &ds_quantize_asym<float>, "DeepSpeed Quantize with fp32 (CUDA)");
m.def(
"ds_quantize_asym_fp16", &ds_quantize_asym<__half>, "DeepSpeed Quantize with fp16 (CUDA)");
m.def("ds_sr_quantize_asym_fp32",
&ds_sr_quantize_asym<float>,
"DeepSpeed Quantize with fp32 (CUDA)");
m.def("ds_sr_quantize_asym_fp16",
&ds_sr_quantize_asym<__half>,
"DeepSpeed Quantize with fp16 (CUDA)");
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#include "conversion_utils.h"
#include "inference_cuda_layers.h"
#include "memory_access_utils.h"
namespace cg = cooperative_groups;
#define MAX_CAP 4
#define MAX_SEQ 2048
inline __device__ float relu(const float x) { return x < 0 ? 0 : x; }
/*
In-place relu(biasAdd(x)) for channels last
*/
template <typename T>
__global__ void fused_bias_relu(T* input, const T* bias, int total_count, int intermediate_size)
{
// Input restriction: intermediate_size % values_per_access == 0
constexpr int granularity = 16;
constexpr int values_per_access = granularity / sizeof(T);
const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * values_per_access;
if (offset < total_count) {
T data[values_per_access];
T data_bias[values_per_access];
mem_access::load_global<granularity>(data, input + offset);
mem_access::load_global<granularity>(data_bias, bias + (offset % intermediate_size));
#pragma unroll
for (int i = 0; i < values_per_access; i++) {
float data_f = conversion::to<float>(data[i]);
float bias_f = conversion::to<float>(data_bias[i]);
data[i] = conversion::to<T>(relu(data_f + bias_f));
}
mem_access::store_global<granularity>(input + offset, data);
}
}
template <typename T>
void launch_bias_relu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
hipStream_t stream)
{
constexpr int threads = 1024;
constexpr int granularity = 16;
const int total_count = batch_size * intermediate_size;
const int elems_per_block = threads * (granularity / sizeof(T));
dim3 block_dims(threads);
dim3 grid_dims((total_count + elems_per_block - 1) / elems_per_block);
hipLaunchKernelGGL(( fused_bias_relu), dim3(grid_dims), dim3(block_dims), 0, stream,
input, bias, total_count, intermediate_size);
}
template void launch_bias_relu<float>(float*, const float*, int, int, hipStream_t);
template void launch_bias_relu<__half>(__half*, const __half*, int, int, hipStream_t);
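A hedged example of calling the launcher above: the activation buffer has shape [batch_size, intermediate_size] in channels-last order, and because loads and stores are 16-byte vectorized, intermediate_size must be a multiple of 16 / sizeof(T) (8 for __half, 4 for float). The function and buffer names below are illustrative.
// Hypothetical call site for the fused bias + ReLU kernel.
void run_bias_relu_example(__half* activations, const __half* bias,
                           int batch_size, int intermediate_size,
                           hipStream_t stream)
{
    launch_bias_relu<__half>(activations, bias, intermediate_size, batch_size, stream);
}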