Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
#!/bin/bash
# Ensure an empty folder exists at the given path.
# If it already exists, delete its regular files (subdirectories are left
# alone, matching the original behavior); otherwise create it.
# Arguments: $1 - path of the folder to prepare
function prep_folder()
{
    local folder="$1"
    if [[ -d "${folder}" ]]; then
        # '--' guards against folder names starting with '-'; '-f' keeps the
        # command quiet when the folder is already empty.
        rm -f -- "${folder}"/*
    else
        mkdir -p -- "${folder}"
    fi
}
# Verify that the async I/O environment (libaio) is usable by running the
# validation script; exits the whole script with status 1 on failure.
function validate_environment()
{
    # Invoke directly instead of via eval — nothing here needs re-parsing.
    python ./validate_async_io.py
    local res=$?
    # Numeric comparison (-ne) instead of the original string compare (!=).
    if [[ ${res} -ne 0 ]]; then
        echo "Failing because environment is not properly configured"
        echo "Possible fix: sudo apt-get install libaio-dev"
        exit 1
    fi
}
validate_environment

if [[ $# -ne 3 ]]; then
    # Fixed the original usage typo: "<write dir ><output log dir>".
    echo "Usage: $0 <write size in MB> <write dir> <output log dir>"
    exit 1
fi

SIZE="$1M"
WRITE_DIR="$2"
LOG_DIR="$3/aio_perf_sweep"
OUTPUT_FILE="${WRITE_DIR}/ds_aio_write_${SIZE}B.pt"
WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}"

prep_folder "${WRITE_DIR}"
prep_folder "${LOG_DIR}"

RUN_SCRIPT=./test_ds_aio.py
DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"

# Sweep submission mode x overlap mode x threads x io_parallel x queue depth
# x block size; one log file per configuration.
for sub in single block; do
    if [[ "${sub}" == "single" ]]; then
        sub_opt="--single_submit"
    else
        sub_opt=""
    fi
    for ov in overlap sequential; do
        if [[ "${ov}" == "overlap" ]]; then
            ov_opt="--overlap_events"
        else
            ov_opt=""
        fi
        for t in 1 2 4 8; do
            for p in 1; do
                for d in 1 2 4 8 16 32; do
                    for bs in 128K 256K 512K 1M; do
                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
                        LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                        cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                        # Drop the page cache so each run measures real device I/O.
                        echo "${DISABLE_CACHE}"
                        echo "${cmd}"
                        echo "${SYNC}"
                        eval "${DISABLE_CACHE}"
                        eval "${cmd}"
                        eval "${SYNC}"
                        sleep 2
                    done
                done
            done
        done
    done
done
{
"block_size": [
"128K",
"256K",
"1M"
],
"queue_depth": [
4,
16,
32
],
"io_parallel": [
1,
2,
4,
8
],
"single_submit": [
true,
false
],
"overlap_events": [
true,
false
],
"threads": [
1
]
}
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import torch
import argparse
import time
import sys
from multiprocessing import Pool
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
from test_ds_aio_utils import refine_args
def parse_arguments():
    """Build and parse the command-line arguments for the AIO benchmark.

    Returns:
        argparse.Namespace: the parsed arguments (also echoed to stdout).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--read_file', type=str, default=None, help='Read file.')
    ap.add_argument('--write_file', type=str, default=None, help='Write file.')
    ap.add_argument('--write_size',
                    type=str,
                    default=None,
                    help='Number of bytes to write.')
    ap.add_argument('--block_size', type=str, default='1M', help='I/O block size.')
    ap.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')
    ap.add_argument('--threads', type=int, default=1, help='Thread parallelism count.')
    ap.add_argument(
        '--single_submit',
        action='store_true',
        help='Submit I/O requests in singles (default is submit queue_depth amount at once.).')
    ap.add_argument('--overlap_events',
                    action='store_true',
                    help='Overlap I/O submission and completion requests.')
    ap.add_argument('--validate', action='store_true', help='Perform validation in library.')
    ap.add_argument('--handle', action='store_true', help='Use AIO handle.')
    ap.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
    ap.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')
    ap.add_argument('--gpu', action='store_true', help='Use GPU memory')

    parsed = ap.parse_args()
    print(f'args = {parsed}')
    return parsed
def validate_args(args):
    """Return False (printing an error) when a requested read file is missing."""
    missing_read_file = bool(args.read_file) and not os.path.isfile(args.read_file)
    if missing_read_file:
        print(f'args validation error: {args.read_file} not found')
        return False
    return True
def main():
    """Entry point: parse and normalize args, then dispatch the AIO benchmark."""
    print(f'Testing deepspeed_aio python frontend')

    cli_args = parse_arguments()
    refine_args(cli_args)
    if not validate_args(cli_args):
        quit()

    # Spawned workers get a clean interpreter (no inherited CUDA state).
    mp.set_start_method('spawn')
    if cli_args.handle:
        runner = aio_handle_multiprocessing
    else:
        runner = aio_basic_multiprocessing
    if cli_args.read_file:
        runner(cli_args, True)
    if cli_args.write_file:
        runner(cli_args, False)


if __name__ == "__main__":
    main()
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
# 1 GiB, used to express byte counts as GB/sec throughput figures.
BYTES_PER_GB = 1024**3
# Only these task ids produce log output.
LOG_TIDS = [0]


def task_log(tid, msg):
    """Print msg tagged with its task id, but only for whitelisted ids."""
    if tid not in LOG_TIDS:
        return
    print(f'tid {tid}: {msg}')
def task_barrier(barrier, num_parties):
    """Rendezvous on barrier, checking it expects exactly num_parties waiters.

    Asserts the barrier is still intact (not broken) after the wait.
    """
    expected = barrier.parties
    assert expected == num_parties
    barrier.wait()
    assert not barrier.broken
def report_results(args, read_op, pool_results):
    """Summarize per-process timing tuples and print latency/throughput.

    Each pool_results entry is (e2e_sec, task_sec, num_bytes); a None entry
    means one of the worker processes failed, in which case only a failure
    message is printed.
    """
    io_string = 'Read' if read_op else 'Write'
    if None in pool_results:
        print(f'Failure in one of {args.threads} {io_string} processes')
        return

    total_bytes = sum(entry[2] for entry in pool_results)

    task_latency_sec = max(entry[1] for entry in pool_results)
    task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB
    print(f'Task {io_string} Latency = {task_latency_sec} sec')
    print(f'Task {io_string} Speed = {task_speed_GB} GB/sec')

    e2e_latency_sec = max(entry[0] for entry in pool_results)
    e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB
    print(f'E2E {io_string} Latency = {e2e_latency_sec} sec')
    print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec')
def refine_integer_value(value):
    """Convert a size string with an optional K/M/G suffix into an int.

    Generalized to accept lower-case suffixes and surrounding whitespace
    ('1K' == '1k' == 1024); plain numeric strings convert directly.

    Raises:
        ValueError: if the numeric part is not a valid integer.
    """
    unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
    value = value.strip()
    suffix = value[-1].upper() if value else ''
    if suffix in unit_dict:
        return int(value[:-1]) * unit_dict[suffix]
    return int(value)
def refine_args(args):
    """Normalize string size arguments (e.g. '1M') to integer byte counts, in place."""
    if type(args.write_size) == str and args.write_size:
        args.write_size = refine_integer_value(args.write_size)
    if type(args.block_size) == str and args.block_size:
        args.block_size = refine_integer_value(args.block_size)
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
# Import-time sanity check: this script exits non-zero unless DeepSpeed's
# async I/O extension can be built/loaded in this environment (the calling
# shell script suggests installing libaio-dev on failure).
import deepspeed
from deepspeed.ops.aio import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible()
#include "custom_cuda_layers.h"
// Cast a float buffer elementwise to fp16: output[i] = (__half)input[i].
// One thread per element; threads with id >= size do nothing.
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < size) { output[id] = (__half)input[id]; }
}
// Launch param_update_kernel on `stream` with 1024-thread blocks and enough
// blocks to cover `size` elements.
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
// Copy two packed fp16 values per thread.
// NOTE(review): each 4-byte slot of `input` is reinterpreted bitwise as a
// packed __half2 pair, not numerically converted — this assumes the caller
// stages fp16 data inside the float buffer (half_precision path). Confirm
// against the staging-buffer writers.
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __half2* output_cast = reinterpret_cast<__half2*>(output);
    if (id < size) {
        float input_f = input[id];
        __half2* input_h = reinterpret_cast<__half2*>(&input_f);
        output_cast[id] = *input_h;
    }
}
// Launch the packed-half copy kernel; `size` is halved because each thread
// moves a __half2 (two elements). NOTE(review): integer division assumes
// `size` is even — odd tails would drop an element; confirm callers.
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;
    size /= 2;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
// Cast a float buffer elementwise to fp16: output[i] = (__half)input[i].
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < size) { output[id] = (__half)input[id]; }
}

// Launch the elementwise cast kernel on `stream`, 1024 threads per block.
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
    int threads = 1024;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}

// Copy two packed fp16 values per thread.
// NOTE(review): each 4-byte slot of `input` is reinterpreted bitwise as a
// packed __half2 pair rather than numerically converted — confirm the caller
// stages fp16 data in the float buffer on the half_precision path.
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __half2* output_cast = reinterpret_cast<__half2*>(output);
    if (id < size) {
        float input_f = input[id];
        __half2* input_h = reinterpret_cast<__half2*>(&input_f);
        output_cast[id] = *input_h;
    }
}

// Launch the packed-half copy kernel; `size` is halved because each thread
// moves a __half2 (two elements). NOTE(review): assumes `size` is even.
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream)
{
    int threads = 1024;
    size /= 2;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    hipLaunchKernelGGL(( param_update_kernel_half), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
#pragma once
#ifdef _WIN32
#include <windows.h>
#else
#include <time.h>
#endif
#ifdef _WIN32
// Wall-clock stopwatch based on the Win32 high-resolution performance
// counter. Accumulates elapsed seconds across Start/Stop pairs until
// Reset/Restart.
class Stopwatch {
private:
    double m_total_time;         // accumulated seconds across Start/Stop pairs
    LARGE_INTEGER m_start_time;  // counter value captured at the last Start()

public:
    Stopwatch() { m_total_time = 0.0; }
    ~Stopwatch() {}

    // Clear the accumulated total (does not touch the start timestamp).
    void Reset() { m_total_time = 0.0; }

    // Begin (or resume) timing from now.
    void Start() { QueryPerformanceCounter(&m_start_time); }

    // Equivalent to Reset() followed by Start().
    void Restart()
    {
        m_total_time = 0.0;
        QueryPerformanceCounter(&m_start_time);
    }

    // Fold the time since the last Start() into the running total.
    void Stop()
    {
        LARGE_INTEGER frequency;
        LARGE_INTEGER stop_time;
        QueryPerformanceFrequency(&frequency);
        QueryPerformanceCounter(&stop_time);
        m_total_time +=
            ((double)(stop_time.QuadPart - m_start_time.QuadPart) / (double)frequency.QuadPart);
    }

    double GetTimeInSeconds() { return m_total_time; }
};
#else
// Wall-clock stopwatch built on CLOCK_MONOTONIC.
// Accumulates elapsed seconds across Start/Stop pairs; GetTimeInSeconds()
// may be called while running and keeps the stopwatch going.
class Stopwatch {
private:
    double accumulated_secs_;
    struct timespec started_at_;
    bool running_;

public:
    Stopwatch() : accumulated_secs_(0.0), running_(false) {}
    ~Stopwatch() {}

    // Drop any accumulated time.
    void Reset() { accumulated_secs_ = 0.0; }

    // Begin (or resume) timing from now.
    void Start()
    {
        clock_gettime(CLOCK_MONOTONIC, &started_at_);
        running_ = true;
    }

    // Equivalent to Reset() followed by Start().
    void Restart()
    {
        accumulated_secs_ = 0.0;
        clock_gettime(CLOCK_MONOTONIC, &started_at_);
        running_ = true;
    }

    // Fold the time since the last Start() into the total; no-op when idle.
    void Stop()
    {
        if (!running_) { return; }
        running_ = false;
        struct timespec now;
        clock_gettime(CLOCK_MONOTONIC, &now);
        accumulated_secs_ += (double)(now.tv_sec - started_at_.tv_sec) +
                             (double)(now.tv_nsec - started_at_.tv_nsec) / 1e9;
    }

    // Total accumulated seconds; a running stopwatch is sampled and resumed.
    double GetTimeInSeconds()
    {
        if (running_) {
            Stop();
            Start();
        }
        return accumulated_secs_;
    }
};
#endif
#ifndef __TIMER_H__
#define __TIMER_H__
#include <cuda_runtime.h>
#include <chrono>
#include "cuda.h"
// GPU-side timer built on CUDA events.
// Record() marks the start; Elapsed() records a stop event, synchronizes on
// it, and returns the elapsed time between the two events in milliseconds.
class GPUTimer {
    cudaEvent_t start, stop;

public:
    GPUTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }
    ~GPUTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    // Mark the start of the timed region.
    inline void Record() { cudaEventRecord(start); }
    // Milliseconds between Record() and now; blocks until the stop event
    // has completed.
    inline void Elapsed(float& time_elapsed)
    {
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time_elapsed, start, stop);
    }
};
// CPU-side millisecond timer based on std::chrono's high-resolution clock.
class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}

    // Restart the measurement window at "now".
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }

    // Milliseconds since construction/Reset/last Elapsed(); also restarts
    // the window, so consecutive calls measure consecutive intervals.
    inline float Elapsed()
    {
        auto previous = start;
        start = std::chrono::high_resolution_clock::now();
        auto micros =
            std::chrono::duration_cast<std::chrono::microseconds>(start - previous).count();
        return (float)(micros / 1e3);
    }
};
#endif
// !!! This is a file automatically generated by hipify!!!
#ifndef __TIMER_H__
#define __TIMER_H__
#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"
// GPU-side timer built on HIP events: Record() marks the start; Elapsed()
// records a stop event, synchronizes on it, and returns the elapsed time
// between the two events in milliseconds.
class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }
    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }
    // Mark the start of the timed region.
    inline void Record() { hipEventRecord(start); }
    // Blocks until the stop event completes.
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

// CPU-side millisecond timer; Elapsed() also restarts the measurement
// window, so consecutive calls measure consecutive intervals.
class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() /
                       1e3);
    }
};
#endif
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
// Older PyTorch exposed AT_CHECK instead of TORCH_CHECK; alias for
// source compatibility.
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

// Tensor::data() was renamed data_ptr() in PyTorch >= 1.3; pick the right
// accessor based on the build-time version flag.
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
#pragma once
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "gemm_test.h"
#define WARP_SIZE 32

// Abort (assert) on any CUDA runtime error returned by `callstr`.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

// Grid-stride 1D loop over n elements.
#define CUDA_1D_KERNEL_LOOP(i, n) \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

// Grid-stride 2D loop over an n x m index space.
#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                                                          \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
        for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)

#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144

// Blocks needed to cover N elements at DS_CUDA_NUM_THREADS per block,
// clamped to [1, DS_MAXIMUM_NUM_BLOCKS].
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}
// Process-wide singleton holding shared GPU resources for the transformer
// kernels: a cuRAND generator, a cuBLAS handle, a scratch workspace pointer,
// and an RNG seed/offset counter (presumably consumed by dropout kernels —
// verify against callers of IncrementOffset).
class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        cublasDestroy(_cublasHandle);
        cudaFree(_workspace);
    }

    // Meyers-singleton accessor.
    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    // Install an externally allocated scratch buffer (freed with cudaFree in
    // the destructor); null is rejected.
    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    curandGenerator_t& GetRandGenerator() { return _gen; }

    // Current PyTorch CUDA stream, so kernels serialize with framework work.
    cudaStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
        return stream;
    }

    cudaStream_t GetNewStream() { return at::cuda::getStreamFromPool(); }

    cublasHandle_t GetCublasHandle() { return _cublasHandle; }

    // Reserve offset_inc RNG offsets; returns (seed, offset-before-increment).
    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    // Choose cuBLAS GEMM algorithms for the five transformer GEMMs, either by
    // benchmarking (test_gemm) or by falling back to default algo id 99.
    // Results are cached in _gemm_algos; repeat calls are no-ops.
    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            cublasHandle_t handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            CUBLAS_OP_T,
                                            CUBLAS_OP_N,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            CUBLAS_OP_N,
                                            CUBLAS_OP_N,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    curandGenerator_t _gen;        // cuRAND generator (seeded with 123 above)
    cublasHandle_t _cublasHandle;
    void* _workspace;              // scratch buffer shared by kernels
    uint64_t _seed;                // RNG seed (default 42)
    uint64_t _curr_offset;         // running RNG offset
    std::vector<std::array<int, 3>> _gemm_algos;  // chosen GEMM algorithm ids
};
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "gemm_test_hip.h"
#define WARP_SIZE 32

// Abort (assert) on any HIP runtime error returned by `callstr`.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        hipError_t error_code = callstr;                                                       \
        if (error_code != hipSuccess) {                                                        \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

// Grid-stride 1D loop over n elements.
#define CUDA_1D_KERNEL_LOOP(i, n) \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

// Grid-stride 2D loop over an n x m index space.
#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                                                          \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
        for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)

#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144

// Blocks needed to cover N elements at DS_CUDA_NUM_THREADS per block,
// clamped to [1, DS_MAXIMUM_NUM_BLOCKS].
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}

// Process-wide singleton holding shared GPU resources (hipified mirror of
// the CUDA Context): a hipRAND generator, a rocBLAS handle, a scratch
// workspace pointer, and an RNG seed/offset counter.
class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
        hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        rocblas_destroy_handle(_cublasHandle);
        hipFree(_workspace);
    }

    // Meyers-singleton accessor.
    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    // Install an externally allocated scratch buffer (freed with hipFree in
    // the destructor); null is rejected.
    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    hiprandGenerator_t& GetRandGenerator() { return _gen; }

    // Current PyTorch HIP stream, so kernels serialize with framework work.
    hipStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
        return stream;
    }

    hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }

    rocblas_handle GetCublasHandle() { return _cublasHandle; }

    // Reserve offset_inc RNG offsets; returns (seed, offset-before-increment).
    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    // Choose GEMM algorithms for the five transformer GEMMs, by benchmarking
    // when test_gemm is set, otherwise default algo id 99. Results are
    // cached; repeat calls are no-ops.
    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            rocblas_handle handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            rocblas_operation_transpose,
                                            rocblas_operation_none,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            rocblas_operation_none,
                                            rocblas_operation_none,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    hiprandGenerator_t _gen;   // hipRAND generator (seeded with 123 above)
    rocblas_handle _cublasHandle;
    void* _workspace;          // scratch buffer shared by kernels
    uint64_t _seed;            // RNG seed (default 42)
    uint64_t _curr_offset;     // running RNG offset
    std::vector<std::array<int, 3>> _gemm_algos;  // chosen GEMM algorithm ids
};
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
// Declares Step_1 / Step_4 / Step_8: Adagrad CPU update over _param_size
// elements using a SIMD span of 1/4/8 vectors per iteration; when dev_param
// is non-null the updated parameters are also copied to that GPU buffer.
#define STEP(SPAN)                                \
    void Step_##SPAN(float* _params,              \
                     float* grads,                \
                     float* _exp_avg_sq,          \
                     size_t _param_size,          \
                     __half* dev_param = nullptr, \
                     bool half_precision = false);

// CPU Adagrad optimizer with optional AVX acceleration and double-buffered,
// pinned-host staging of updated parameters to the GPU.
class Adagrad_Optimizer {
public:
    Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
        : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
    {
        // Pinned host buffers so async copies to dev_param can overlap compute.
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adagrad_Optimizer()
    {
        cudaFreeHost(_doubled_buffer[0]);
        cudaFreeHost(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    // Wait for both parameter-copy streams to drain.
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
    }
    // Track the caller's step counter (Adagrad needs no bias correction).
    inline void IncrementStep(size_t step)
    {
        _step++;
        if (_step != step) { _step = step; }
    }
    // Refresh hyperparameters before the next step.
    inline void update_state(float lr, float epsilon, float weight_decay)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
    }

private:
    float _alpha;  // learning rate
    float _eps;    // denominator epsilon
    float _weight_decay;
    float _betta1_t;  // NOTE(review): not used in the visible Adagrad code
    float _betta2_t;  // NOTE(review): not used in the visible Adagrad code
    size_t _step;
    float* _doubled_buffer[2];  // pinned host staging buffers (double buffered)
    bool _buf_index;            // which staging buffer is currently in flight
    cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized Adagrad update over the SIMD-aligned prefix of the parameters.
// Processes ROUND_DOWN(_param_size, SIMD_WIDTH * span) elements in TILE-sized
// chunks and reports that count through *rounded_size; the scalar tail is
// left for the caller. When dev_params is set, updated parameters are staged
// in the pinned buffers and pushed to the GPU on alternating streams.
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
                                 float* _params,
                                 float* grads,
                                 float* _exp_avg_sq,
                                 size_t _param_size,
                                 __half* dev_params,
                                 bool half_precision)
{
    size_t new_rounded_size = 0;
    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    // Negative so the final FMA subtracts the scaled update from the params.
    float step_size = -1 * _alpha;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    AVX_Data weight_decay4;
    if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        // Reuse a staging buffer only after its previous device copy is done.
        if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            // NOTE(review): reloads `grads` (Adagrad has no momentum state),
            // but always as fp32 while the load above honors half_precision —
            // confirm this is intended on the fp16-gradient path.
            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, grads + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }

            // v += g^2 ; p += (-lr) * g / (sqrt(v) + eps)
            simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_add<span>(grad_4, grad_4, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);
            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
// Declares Step_1 / Step_4 / Step_8: Adagrad CPU update over _param_size
// elements using a SIMD span of 1/4/8 vectors per iteration; when dev_param
// is non-null the updated parameters are also copied to that GPU buffer.
#define STEP(SPAN)                                \
    void Step_##SPAN(float* _params,              \
                     float* grads,                \
                     float* _exp_avg_sq,          \
                     size_t _param_size,          \
                     __half* dev_param = nullptr, \
                     bool half_precision = false);

// CPU Adagrad optimizer (hipified mirror) with optional AVX acceleration
// and double-buffered, pinned-host staging of parameters to the GPU.
class Adagrad_Optimizer {
public:
    Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
        : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
    {
        // Pinned host buffers so async copies to dev_param can overlap compute.
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adagrad_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    // Wait for both parameter-copy streams to drain.
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    // Track the caller's step counter (Adagrad needs no bias correction).
    inline void IncrementStep(size_t step)
    {
        _step++;
        if (_step != step) { _step = step; }
    }
    // Refresh hyperparameters before the next step.
    inline void update_state(float lr, float epsilon, float weight_decay)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
    }

private:
    float _alpha;  // learning rate
    float _eps;    // denominator epsilon
    float _weight_decay;
    float _betta1_t;  // NOTE(review): not used in the visible Adagrad code
    float _betta2_t;  // NOTE(review): not used in the visible Adagrad code
    size_t _step;
    float* _doubled_buffer[2];  // pinned host staging buffers (double buffered)
    bool _buf_index;            // which staging buffer is currently in flight
    hipStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized Adagrad update over the SIMD-aligned prefix of the parameters.
// Processes ROUND_DOWN(_param_size, SIMD_WIDTH * span) elements in TILE-sized
// chunks and reports that count through *rounded_size; the scalar tail is
// left for the caller. When dev_params is set, updated parameters are staged
// in the pinned buffers and pushed to the GPU on alternating streams.
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
                                 float* _params,
                                 float* grads,
                                 float* _exp_avg_sq,
                                 size_t _param_size,
                                 __half* dev_params,
                                 bool half_precision)
{
    size_t new_rounded_size = 0;
    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    // Negative so the final FMA subtracts the scaled update from the params.
    float step_size = -1 * _alpha;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    AVX_Data weight_decay4;
    if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        // Reuse a staging buffer only after its previous device copy is done.
        if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            // NOTE(review): reloads `grads` (Adagrad has no momentum state),
            // but always as fp32 while the load above honors half_precision —
            // confirm this is intended on the fp16-gradient path.
            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, grads + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }

            // v += g^2 ; p += (-lr) * g / (sqrt(v) + eps)
            simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_add<span>(grad_4, grad_4, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);
            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
// Declares Step_1 / Step_4 / Step_8: Adam CPU update with first (exp_avg)
// and second (exp_avg_sq) moment buffers; when dev_param is non-null the
// updated parameters are also copied to that GPU buffer.
#define STEP(SPAN)                                \
    void Step_##SPAN(float* _params,              \
                     float* grads,                \
                     float* _exp_avg,             \
                     float* _exp_avg_sq,          \
                     size_t _param_size,          \
                     __half* dev_param = nullptr, \
                     bool half_precision = false);

// CPU Adam/AdamW optimizer with optional AVX acceleration and
// double-buffered, pinned-host staging of updated parameters to the GPU.
class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        // Pinned host buffers so async copies to dev_param can overlap compute.
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adam_Optimizer()
    {
        cudaFreeHost(_doubled_buffer[0]);
        cudaFreeHost(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    // Wait for both parameter-copy streams to drain.
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
    }
    // Advance the step counter while keeping the cached beta powers
    // (_betta1_t = betta1^step, _betta2_t = betta2^step) in sync: the powers
    // are recomputed from scratch when the betas change or the step number
    // jumps, and updated incrementally on the sequential fast path.
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else {
            _step++;
            if (_step != step) {
                _betta1_t = std::pow(_betta1, step);
                _betta2_t = std::pow(_betta2, step);
                _step = step;
            } else {
                _betta1_t *= _betta1;
                _betta2_t *= _betta2;
            }
        }
    }
    // Refresh hyperparameters and, when requested, the bias-correction
    // terms used by the next step (1 - beta1^t and 1/sqrt(1 - beta2^t)).
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;

        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
    float _alpha;   // learning rate
    float _betta1;  // first-moment decay rate
    float _betta2;  // second-moment decay rate
    float _eps;
    float _weight_decay;
    float _betta1_t;  // betta1^step (cached)
    float _betta2_t;  // betta2^step (cached)
    size_t _step;
    float _bias_correction1;    // 1 - betta1^t (or 1 when disabled)
    float _bias_correction2;    // 1/sqrt(1 - betta2^t) (or 1 when disabled)
    float* _doubled_buffer[2];  // pinned host staging buffers (double buffered)
    bool _buf_index;            // which staging buffer is currently in flight
    bool _adamw_mode;           // true: decoupled (AdamW-style) weight decay
    cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized Adam/AdamW update (AVX-512 or AVX-256, chosen at build time).
// Processes the largest prefix of _param_size that is a multiple of
// SIMD_WIDTH * span and reports that length through *rounded_size; the
// caller is expected to finish the tail with the scalar path.
//
// _params, _exp_avg and _exp_avg_sq are updated in place; the grads memory
// is only read (gradient math happens in registers). When dev_params is
// non-null, each TILE of updated parameters is staged into a pinned host
// buffer and asynchronously copied to the device as __half. half_precision
// selects __half layout for the _params/grads host buffers.
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
// Broadcast all hyper-parameters into SIMD lanes once, outside the loop.
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
AVX_Data betta2_4;
betta2_4.data = SIMD_SET(_betta2);
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
AVX_Data betta1_minus1_4;
betta1_minus1_4.data = SIMD_SET(betta1_minus1);
AVX_Data betta2_minus1_4;
betta2_minus1_4.data = SIMD_SET(betta2_minus1);
// _bias_correction2 already holds 1/sqrt(1 - beta2^t).
AVX_Data bias2_sqrt;
bias2_sqrt.data = SIMD_SET(_bias_correction2);
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
// Negative so the final FMA subtracts the scaled update.
float step_size = -1 * _alpha / _bias_correction1;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
// AdamW uses -alpha*wd (applied to params); classic mode uses raw wd
// (folded into the gradient).
float w_decay = -1 * _alpha * _weight_decay;
AVX_Data weight_decay4;
if (_weight_decay > 0)
weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
// Double buffering: from the third tile on, wait for the async copy
// that last used this pinned buffer before overwriting it.
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
// Classic (L2) decay: fold wd * param into the gradient.
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
}
// m = beta1 * m + (1 - beta1) * g
simd_mul<span>(momentum_4, momentum_4, betta1_4);
simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
// v = beta2 * v + (1 - beta2) * g^2  (grad_4 reused as scratch)
simd_mul<span>(variance_4, variance_4, betta2_4);
simd_mul<span>(grad_4, grad_4, grad_4);
simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
// update = m / (sqrt(v) * bias_correction2 + eps)
simd_sqrt<span>(grad_4, variance_4);
simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
// AdamW: decoupled decay applied directly to the parameters
// (weight_decay4 holds -alpha * wd in this mode).
if (_weight_decay > 0 && _adamw_mode) {
simd_fma<span>(param_4, param_4, weight_decay4, param_4);
}
// param += (-alpha / bias_correction1) * update
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
// Stage the updated values for the async device copy below.
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
// Kick off the async host->device copy of this tile, then swap
// pinned buffers so the next tile can proceed without waiting.
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
// Declares a scalar Step_<SPAN> entry point for the optimizer. All spans
// share one signature; dev_param, when non-null, designates a device-side
// __half destination for the updated parameters, and half_precision marks
// _params/grads as __half-laid-out buffers.
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
// CPU-side Adam/AdamW optimizer (HIP-hipified build). Holds the
// hyper-parameter state, cached beta powers, two pinned host staging
// buffers and two HIP streams used to pipeline parameter copies to the
// device.
class Adam_Optimizer {
public:
Adam_Optimizer(float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true)
: _alpha(alpha),
_betta1(betta1),
_betta2(betta2),
_eps(eps),
_weight_decay(weight_decay),
_betta1_t(1.0),
_betta2_t(1.0),
_step(0),
_buf_index(false),
_adamw_mode(adamw_mode)
{
// Two pinned host buffers of TILE floats for double-buffered
// host->device copies.
// NOTE(review): hipHostMalloc return codes are not checked — a failed
// allocation would surface later as a crash. Consider verifying.
hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adam_Optimizer()
{
// Release the pinned staging buffers; streams are owned by Context.
hipHostFree(_doubled_buffer[0]);
hipHostFree(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized step over the SIMD-aligned prefix; *rounded_size reports
// how many elements were handled (tail goes to the scalar Step_*).
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
// Scalar entry points Step_1 / Step_4 / Step_8 (see the STEP macro).
STEP(1)
STEP(4)
STEP(8)
// Block the host until both copy streams have drained.
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
}
// Advance the step counter and keep the cached beta powers
// (_betta1_t = beta1^step, _betta2_t = beta2^step) consistent.
inline void IncrementStep(size_t step, float beta1, float beta2)
{
if (beta1 != _betta1 || beta2 != _betta2) {
// Betas changed: recompute the powers from scratch.
_step = step;
_betta1 = beta1;
_betta2 = beta2;
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
} else {
_step++;
if (_step != step) {
// Non-consecutive step: recompute exactly and resync.
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
_step = step;
} else {
// Consecutive step: cheap incremental update.
_betta1_t *= _betta1;
_betta2_t *= _betta2;
}
}
}
// Refresh per-step hyper-parameters; when bias_correction is set, derive
// the Adam bias-correction factors from the cached beta powers
// (_bias_correction2 is stored pre-inverted for the hot loop).
inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
_bias_correction1 = 1.0f;
_bias_correction2 = 1.0f;
if (bias_correction == 1) {
_bias_correction1 = 1 - _betta1_t;
_bias_correction2 = 1 / sqrt(1 - _betta2_t);
}
}
private:
float _alpha;  // learning rate
float _betta1;  // Adam beta1
float _betta2;  // Adam beta2
float _eps;  // denominator stabilizer
float _weight_decay;  // decay coefficient (0 disables decay)
float _betta1_t;  // cached beta1^step
float _betta2_t;  // cached beta2^step
size_t _step;  // current optimizer step
float _bias_correction1;  // 1 - beta1^step (or 1 when correction is off)
float _bias_correction2;  // 1/sqrt(1 - beta2^step) (or 1 when correction is off)
float* _doubled_buffer[2];  // pinned host staging buffers
bool _buf_index;  // toggles 0/1 to double-buffer _doubled_buffer
bool _adamw_mode;  // true: decoupled (AdamW) decay; false: classic L2
hipStream_t _streams[2];  // streams for the async parameter copies
};
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized Adam/AdamW update (AVX-512 or AVX-256), HIP-hipified variant.
// Identical math to the CUDA version; only the stream-sync call differs.
// Processes the largest prefix of _param_size that is a multiple of
// SIMD_WIDTH * span and reports it through *rounded_size; the caller
// finishes the tail with the scalar path. _params/_exp_avg/_exp_avg_sq are
// updated in place; grads memory is only read. When dev_params is non-null,
// each TILE of updated parameters is staged into a pinned host buffer and
// asynchronously copied to the device as __half.
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
// Broadcast hyper-parameters into SIMD lanes once, outside the loop.
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
AVX_Data betta2_4;
betta2_4.data = SIMD_SET(_betta2);
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
AVX_Data betta1_minus1_4;
betta1_minus1_4.data = SIMD_SET(betta1_minus1);
AVX_Data betta2_minus1_4;
betta2_minus1_4.data = SIMD_SET(betta2_minus1);
// _bias_correction2 already holds 1/sqrt(1 - beta2^t).
AVX_Data bias2_sqrt;
bias2_sqrt.data = SIMD_SET(_bias_correction2);
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
// Negative so the final FMA subtracts the scaled update.
float step_size = -1 * _alpha / _bias_correction1;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
// AdamW uses -alpha*wd (applied to params); classic mode uses raw wd.
float w_decay = -1 * _alpha * _weight_decay;
AVX_Data weight_decay4;
if (_weight_decay > 0)
weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
// Double buffering: from the third tile on, wait for the async copy
// that last used this pinned buffer before overwriting it.
if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
// Classic (L2) decay: fold wd * param into the gradient.
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
}
// m = beta1 * m + (1 - beta1) * g
simd_mul<span>(momentum_4, momentum_4, betta1_4);
simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
// v = beta2 * v + (1 - beta2) * g^2  (grad_4 reused as scratch)
simd_mul<span>(variance_4, variance_4, betta2_4);
simd_mul<span>(grad_4, grad_4, grad_4);
simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
// update = m / (sqrt(v) * bias_correction2 + eps)
simd_sqrt<span>(grad_4, variance_4);
simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
// AdamW: decoupled decay applied directly to the parameters.
if (_weight_decay > 0 && _adamw_mode) {
simd_fma<span>(param_4, param_4, weight_decay4, param_4);
}
// param += (-alpha / bias_correction1) * update
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
// Stage the updated values for the async device copy below.
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
// Launch the async host->device copy of this tile, then swap
// pinned buffers so the next tile proceeds without waiting.
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
// GEMM wrapper declarations. C = alpha * op(A) * op(B) + beta * C, with the
// algorithm defaulting per platform (rocBLAS standard vs cuBLAS default /
// tensor-op). NOTE(review): alpha/beta precede the matrix pointers here,
// unlike the raw cuBLAS API — callers must match this argument order.
// fp32 GEMM.
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// fp16 GEMM (tensor-op default on CUDA).
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// fp32 strided-batched GEMM over 'batch' matrices; stride_* are the element
// offsets between consecutive matrices in each operand.
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// fp16 strided-batched GEMM (tensor-op default on CUDA).
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
// HIP-hipified GEMM wrapper declarations (rocBLAS handles/enums).
// NOTE(review): the #else branches still reference cublasGemmAlgo_t, which
// this rocBLAS-only header does not declare — only the __HIP_PLATFORM_HCC__
// branch is expected to compile here; confirm the build always defines it.
// fp32 GEMM: C = alpha * op(A) * op(B) + beta * C.
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// fp16 GEMM.
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// fp32 strided-batched GEMM over 'batch' matrices.
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// fp16 strided-batched GEMM.
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
// Evaluate a CUDA call, log the numeric error code with file/line to
// std::cerr and abort via assert on failure.
// NOTE(review): relies on <iostream> being pulled in transitively — it is
// not included above; confirm.
#define CUDA_CHECK(callstr) \
{ \
cudaError_t error_code = callstr; \
if (error_code != cudaSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
// Kernel launch tuning constants.
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8  // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
// ---- Quantization kernel launchers ----------------------------------------
// All quantize variants operate on 'vals' in place: total_count elements
// split into group_num groups, quantized to num_bits.
// (NOTE(review): presumably symmetric quantization — the *_asym variants
// below cover the asymmetric case; confirm against the kernel source.)
template <typename T>
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Stochastic-rounding variant.
template <typename T>
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Asymmetric quantization.
template <typename T>
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Asymmetric quantization with stochastic rounding.
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
const T* bias,
T* output,
int intermediate_size,
int batch_size,
cudaStream_t stream);
// Plain gelu activation.
template <typename T>
void launch_gelu(const T* input,
T* output,
int intermediate_size,
int batch_size,
cudaStream_t stream);
// Gelu backward: writes the gradient into d_output.
template <typename T>
void launch_d_gelu(T* d_output,
const T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream);
// Custom fused bias add with layer normalization
// Forward variant that also saves per-row variances and means for backward.
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
T* vars,
T* means);
// Overload that saves only the variances.
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
T* vars);
// LayerNorm backward fused with the addition of a second incoming gradient
// (out_grad1 + out_grad2); uses saved means/vars and the original input.
// Note: takes two streams so gamma/beta and input gradients can overlap.
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2]);
// Fused-add backward from normalized outputs (vals_hat); 'invertible'
// recovers the input using betta instead of saved means.
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
// LayerNorm backward from the original input and saved means/vars.
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2]);
// LayerNorm backward from normalized outputs (vals_hat); see above for the
// 'invertible'/'betta' pair.
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
// Non-reversible backward variant taking pre-transposed copies of the
// gradient and values.
template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad,
const T* vals,
const T* out_grad_trans,
const T* vals_trans,
const T* means,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2]);
// 2D matrix transpose: out_mat = inp_mat^T for a rows x cols matrix.
template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, cudaStream_t stream);
// Softmax backward over attention scores; writes the gradient into out_grad.
template <typename T>
void launch_attn_softmax_backward(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
cudaStream_t stream);
// v2 of the softmax backward (alternate kernel implementation).
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
cudaStream_t stream);
// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals,
const T* attn_mask,
int batch_size,
int heads,
int sequence_length,
cudaStream_t stream);
// Dimension permutation [0, 1, 2, 3] -> [0, 2, 1, 3] over
// (batch, seq, heads, head_dim)-style layouts.
template <typename T>
void launch_transform_0213(T* output,
const T* vals,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
cudaStream_t stream);
// Custom bias add
// Fused with the 0213 transform; trans_count selects how many stacked
// tensors (e.g. q/k/v) are processed.
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
const T* vals,
const T* bias,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
cudaStream_t stream,
int trans_count);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count);
// ---- Dropout launchers ----------------------------------------------------
// In-place dropout fused with bias add; mask records the kept/dropped
// pattern for backward.
template <typename T>
void launch_dropout(T* vals,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
cudaStream_t stream);
// Out-of-place dropout; bwd reuses an existing mask instead of sampling.
template <typename T>
void launch_dropout(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
int dim,
float ratio,
cudaStream_t stream,
bool bwd = false);
// Dropout fused with residual + bias addition.
template <typename T>
void launch_dropout(T* out,
const T* vals,
const T* residual,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
cudaStream_t stream);
// In-place dropout backward using the saved mask.
template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, cudaStream_t stream);
// Out-of-place dropout backward.
template <typename T>
void launch_dropout_grad(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
float ratio,
cudaStream_t stream);
// Fused transpose + bias reduction (e.g. bias gradient accumulation).
template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp,
T* out,
int rows,
int cols,
cudaStream_t stream);
// Async copy of fp32 host parameters to a device-side __half buffer; the
// _half variant reads the staged source as packed __half data.
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment