push dsv0.8.2 version

67ea635f · aiss · 1b2721ad · 67ea635f · 67ea635f · 67ea635f
Commit 67ea635f authored Mar 30, 2023 by aiss
20 changed files
--- a/deepspeed/ops/__init__.py
+++ b/deepspeed/ops/__init__.py
+'''Copyright The Microsoft DeepSpeed Team'''
+
 from . import adam
 from . import adagrad
 from . import lamb

--- a/deepspeed/ops/adagrad/__init__.py
+++ b/deepspeed/ops/adagrad/__init__.py
+'''Copyright The Microsoft DeepSpeed Team'''
+
 from .cpu_adagrad import DeepSpeedCPUAdagrad
--- a/deepspeed/ops/adagrad/cpu_adagrad.py
+++ b/deepspeed/ops/adagrad/cpu_adagrad.py
@@ -2,11 +2,8 @@
 Copyright 2020 The Microsoft DeepSpeed Team
 '''

-import math
 import torch
-import time
-from pathlib import Path
-from ..op_builder import CPUAdagradBuilder
+from deepspeed.ops.op_builder import CPUAdagradBuilder
 from deepspeed.utils.logging import should_log_le



--- a/deepspeed/ops/adam/__init__.py
+++ b/deepspeed/ops/adam/__init__.py
+'''Copyright The Microsoft DeepSpeed Team'''
+
 from .cpu_adam import DeepSpeedCPUAdam
 from .fused_adam import FusedAdam
--- a/deepspeed/ops/adam/cpu_adam.py
+++ b/deepspeed/ops/adam/cpu_adam.py
@@ -2,12 +2,11 @@
 Copyright 2020 The Microsoft DeepSpeed Team
 '''

-import math
 import torch
-import time
-from pathlib import Path
-from ..op_builder import CPUAdamBuilder
+from cpuinfo import get_cpu_info
+from deepspeed.utils import logger
 from deepspeed.utils.logging import should_log_le
+from deepspeed.ops.op_builder import CPUAdamBuilder


 class DeepSpeedCPUAdam(torch.optim.Optimizer):
@@ -76,6 +75,20 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer):
                            amsgrad=amsgrad)
        super(DeepSpeedCPUAdam, self).__init__(model_params, default_args)

+        cpu_info = get_cpu_info()
+        self.cpu_vendor = cpu_info["vendor_id_raw"].lower(
+        ) if "vendor_id_raw" in cpu_info else "unknown"
+        if "amd" in self.cpu_vendor:
+            for group_id, group in enumerate(self.param_groups):
+                for param_id, p in enumerate(group['params']):
+                    if p.dtype == torch.half:
+                        logger.warning(
+                            "FP16 params for CPUAdam may not work on AMD CPUs")
+                        break
+                else:
+                    continue
+                break
+
        self.opt_id = DeepSpeedCPUAdam.optimizer_id
        DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1
        self.adam_w_mode = adamw_mode

--- a/deepspeed/ops/adam/fused_adam.py
+++ b/deepspeed/ops/adam/fused_adam.py
@@ -6,11 +6,11 @@ This file is adapted from fused adam in NVIDIA/apex, commit a109f85
 '''

 import torch
-import importlib
 from .multi_tensor_apply import MultiTensorApply

 multi_tensor_applier = MultiTensorApply(2048 * 32)
-from ..op_builder import FusedAdamBuilder
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder import FusedAdamBuilder


 class FusedAdam(torch.optim.Optimizer):
@@ -72,7 +72,7 @@ class FusedAdam(torch.optim.Optimizer):

        fused_adam_cuda = FusedAdamBuilder().load()
        # Skip buffer
-        self._dummy_overflow_buf = torch.cuda.IntTensor([0])
+        self._dummy_overflow_buf = get_accelerator().IntTensor([0])
        self.multi_tensor_adam = fused_adam_cuda.multi_tensor_adam

    def zero_grad(self):
@@ -109,12 +109,8 @@ class FusedAdam(torch.optim.Optimizer):
            bias_correction = 1 if group['bias_correction'] else 0
            beta1, beta2 = group['betas']

-            # assume same step across group now to simplify things
-            # per parameter step can be easily support by making it tensor, or pass list into kernel
-            if 'step' in group:
-                group['step'] += 1
-            else:
-                group['step'] = 1
+            if 'step' not in group:
+                group['step'] = 0

            # create lists for multi-tensor apply
            g_16, p_16, m_16, v_16 = [], [], [], []
@@ -131,6 +127,10 @@ class FusedAdam(torch.optim.Optimizer):
                state = self.state[p]
                # State initialization
                if len(state) == 0:
+                    # DeepSpeed ZeRO 3 processes each subgroup one at a time, so we need to keep track of the step count for each tensor separately.
+                    # This is not an issue for ZeRO 1 & 2, since they apply a single optimization step to the whole param group at the same time.
+                    # To keep backward compatibility with existing checkpoints, we use group['step'] to initialize state['step'] if it exists.
+                    state['step'] = group.get('step', 0)
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
@@ -150,6 +150,7 @@ class FusedAdam(torch.optim.Optimizer):
                    raise RuntimeError('FusedAdam only support fp16 and fp32.')

            if (len(g_16) > 0):
+                state['step'] += 1
                multi_tensor_applier(self.multi_tensor_adam,
                                     self._dummy_overflow_buf,
                                     [g_16,
@@ -160,11 +161,12 @@ class FusedAdam(torch.optim.Optimizer):
                                     beta1,
                                     beta2,
                                     group['eps'],
-                                     group['step'],
+                                     state['step'],
                                     self.adam_w_mode,
                                     bias_correction,
                                     group['weight_decay'])
            if (len(g_32) > 0):
+                state['step'] += 1
                multi_tensor_applier(self.multi_tensor_adam,
                                     self._dummy_overflow_buf,
                                     [g_32,
@@ -175,7 +177,7 @@ class FusedAdam(torch.optim.Optimizer):
                                     beta1,
                                     beta2,
                                     group['eps'],
-                                     group['step'],
+                                     state['step'],
                                     self.adam_w_mode,
                                     bias_correction,
                                     group['weight_decay'])

--- a/deepspeed/ops/adam/multi_tensor_apply.py
+++ b/deepspeed/ops/adam/multi_tensor_apply.py
@@ -4,7 +4,6 @@ Copyright 2020 The Microsoft DeepSpeed Team
 Copyright NVIDIA/apex
 This file is adapted from NVIDIA/apex, commit a109f85
 '''
-import torch


 class MultiTensorApply(object):

--- a/deepspeed/ops/aio/__init__.py
+++ b/deepspeed/ops/aio/__init__.py
--- a/deepspeed/ops/csrc
+++ b/deepspeed/ops/csrc
+../../csrc
\ No newline at end of file
--- a/deepspeed/ops/csrc/adagrad/cpu_adagrad.cpp
+++ b/deepspeed/ops/csrc/adagrad/cpu_adagrad.cpp
-#include "cpu_adagrad.h"
-#include <cuda_runtime_api.h>
-#include <math.h>
-#include <omp.h>
-#include <torch/extension.h>
-#include <iostream>
-#include <memory>
-#include <type_traits>
-#include <unordered_map>
-#include "cublas_v2.h"
-#include "cuda.h"
-#include "curand.h"
-#include "custom_cuda_layers.h"
-
-static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
-
-// C++ interface
-
-void Adagrad_Optimizer::Step_1(float* _params,
-                               float* grads,
-                               float* _exp_avg_sq,
-                               size_t _param_size,
-                               __half* dev_params,
-                               bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<1>(
-        &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
-#endif
-    if (_param_size > rounded_size) {
-        float step_size = -1 * _alpha;
-        __half* grads_cast_h;
-        __half* params_cast_h;
-        if (half_precision) {
-            grads_cast_h = reinterpret_cast<__half*>(grads);
-            params_cast_h = reinterpret_cast<__half*>(_params);
-        }
-        for (size_t t = rounded_size; t < _param_size; t += TILE) {
-            size_t copy_size = TILE;
-            if ((t + TILE) > _param_size) copy_size = _param_size - t;
-            size_t offset = copy_size + t;
-            if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
-#pragma omp parallel for
-            for (size_t k = t; k < offset; k++) {
-                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
-                float param = half_precision ? (float)params_cast_h[k] : _params[k];
-                float momentum = grads[k];
-                float variance = _exp_avg_sq[k];
-                if (_weight_decay > 0) { grad = param * _weight_decay + grad; }
-
-                variance += grad * grad;
-
-                grad = sqrt(variance);
-                grad += _eps;
-                grad = momentum / grad;
-                param = grad * step_size + param;
-                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
-
-                if (half_precision)
-                    params_cast_h[k] = (__half)param;
-                else
-                    _params[k] = param;
-                // STORE UPDATE TERM TO GRAD'S MEMORY
-                grads[k] = grad * step_size;
-                _exp_avg_sq[k] = variance;
-            }
-            if (dev_params) {
-                launch_param_update(
-                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
-                _buf_index = !_buf_index;
-            }
-        }
-    }
-}
-
-void Adagrad_Optimizer::Step_4(float* _params,
-                               float* grads,
-                               float* _exp_avg_sq,
-                               size_t _param_size,
-                               __half* dev_params,
-                               bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<4>(
-        &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
-#endif
-    if (_param_size > rounded_size)
-        Step_1((_params + rounded_size),
-               (grads + rounded_size),
-               (_exp_avg_sq + rounded_size),
-               (_param_size - rounded_size),
-               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
-               half_precision);
-}
-
-int create_adagrad_optimizer(int optimizer_id,
-                             float alpha = 1e-2,
-                             float eps = 1e-8,
-                             float weight_decay = 0,
-                             bool should_log = false)
-{
-    auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);
-
-    s_optimizers[optimizer_id] = opt;
-
-    if (should_log) {
-        std::string avx_type = "";
-#if defined(__AVX512__)
-        avx_type = "AVX512";
-#else
-#if defined(__AVX256__)
-        avx_type = "AVX2";
-#else
-        avx_type = "scalar";
-#endif
-#endif
-
-        printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
-               optimizer_id,
-               avx_type.c_str());
-        printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
-    }
-
-    return 0;
-}
-
-void Adagrad_Optimizer::Step_8(float* _params,
-                               float* grads,
-                               float* _exp_avg_sq,
-                               size_t _param_size,
-                               __half* dev_params,
-                               bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<8>(
-        &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
-#endif
-    if (_param_size > rounded_size)
-        Step_4((_params + rounded_size),
-               (grads + rounded_size),
-               (_exp_avg_sq + rounded_size),
-               (_param_size - rounded_size),
-               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
-               half_precision);
-}
-
-int ds_adagrad_step(int optimizer_id,
-                    size_t step,
-                    float lr,
-                    float epsilon,
-                    float weight_decay,
-                    torch::Tensor& params,
-                    torch::Tensor& grads,
-                    torch::Tensor& exp_avg_sq)
-{
-    auto params_c = params.contiguous();
-    auto grads_c = grads.contiguous();
-    auto exp_avg_sq_c = exp_avg_sq.contiguous();
-
-    float* params_ptr = (float*)params_c.data_ptr();
-    float* grads_ptr = (float*)grads_c.data_ptr();
-    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
-    std::shared_ptr<Adagrad_Optimizer> opt =
-        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
-    opt->IncrementStep(step);
-    opt->update_state(lr, epsilon, weight_decay);
-    opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
-
-    opt->SynchronizeStreams();
-    return 0;
-}
-
-int ds_adagrad_step_plus_copy(int optimizer_id,
-                              size_t step,
-                              float lr,
-                              float epsilon,
-                              float weight_decay,
-                              torch::Tensor& params,
-                              torch::Tensor& grads,
-                              torch::Tensor& exp_avg_sq,
-                              torch::Tensor& gpu_params)
-{
-    auto params_c = params.contiguous();
-    auto gpu_params_c = gpu_params.contiguous();
-    auto exp_avg_sq_c = exp_avg_sq.contiguous();
-    auto grads_c = grads.contiguous();
-
-    float* params_ptr = (float*)params_c.data_ptr();
-    float* grads_ptr = (float*)grads_c.data_ptr();
-    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
-    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
-    std::shared_ptr<Adagrad_Optimizer> opt =
-        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
-    opt->IncrementStep(step);
-    opt->update_state(lr, epsilon, weight_decay);
-    opt->Step_8(params_ptr,
-                grads_ptr,
-                exp_avg_sq_ptr,
-                params_c.size(0),
-                gpu_params_ptr,
-                (params.options().dtype() == at::kHalf));
-
-    opt->SynchronizeStreams();
-    return 0;
-}
-
-int destroy_adagrad_optimizer(int optimizer_id)
-{
-    s_optimizers.erase(optimizer_id);
-
-    return 0;
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
-    m.def("adagrad_update_copy",
-          &ds_adagrad_step_plus_copy,
-          "DeepSpeed CPU Adagrad update and param copy (C++)");
-    m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
-    m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
-}
--- a/deepspeed/ops/csrc/adagrad/cpu_adagrad_hip.cpp
+++ b/deepspeed/ops/csrc/adagrad/cpu_adagrad_hip.cpp
-// !!! This is a file automatically generated by hipify!!!
-#include "cpu_adagrad_hip.h"
-#include <hip/hip_runtime_api.h>
-#include <math.h>
-#include <omp.h>
-#include <torch/extension.h>
-#include <iostream>
-#include <memory>
-#include <type_traits>
-#include <unordered_map>
-#include "rocblas.h"
-#include "hip/hip_runtime.h"
-#include "hiprand/hiprand.h"
-#include "custom_hip_layers.h"
-
-static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
-
-// C++ interface
-
-void Adagrad_Optimizer::Step_1(float* _params,
-                               float* grads,
-                               float* _exp_avg_sq,
-                               size_t _param_size,
-                               __half* dev_params,
-                               bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<1>(
-        &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
-#endif
-    if (_param_size > rounded_size) {
-        float step_size = -1 * _alpha;
-        __half* grads_cast_h;
-        __half* params_cast_h;
-        if (half_precision) {
-            grads_cast_h = reinterpret_cast<__half*>(grads);
-            params_cast_h = reinterpret_cast<__half*>(_params);
-        }
-        for (size_t t = rounded_size; t < _param_size; t += TILE) {
-            size_t copy_size = TILE;
-            if ((t + TILE) > _param_size) copy_size = _param_size - t;
-            size_t offset = copy_size + t;
-            if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
-#pragma omp parallel for
-            for (size_t k = t; k < offset; k++) {
-                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
-                float param = half_precision ? (float)params_cast_h[k] : _params[k];
-                float momentum = grads[k];
-                float variance = _exp_avg_sq[k];
-                if (_weight_decay > 0) { grad = param * _weight_decay + grad; }
-
-                variance += grad * grad;
-
-                grad = sqrt(variance);
-                grad += _eps;
-                grad = momentum / grad;
-                param = grad * step_size + param;
-                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
-
-                if (half_precision)
-                    params_cast_h[k] = (__half)param;
-                else
-                    _params[k] = param;
-                // STORE UPDATE TERM TO GRAD'S MEMORY
-                grads[k] = grad * step_size;
-                _exp_avg_sq[k] = variance;
-            }
-            if (dev_params) {
-                launch_param_update(
-                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
-                _buf_index = !_buf_index;
-            }
-        }
-    }
-}
-
-void Adagrad_Optimizer::Step_4(float* _params,
-                               float* grads,
-                               float* _exp_avg_sq,
-                               size_t _param_size,
-                               __half* dev_params,
-                               bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<4>(
-        &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
-#endif
-    if (_param_size > rounded_size)
-        Step_1((_params + rounded_size),
-               (grads + rounded_size),
-               (_exp_avg_sq + rounded_size),
-               (_param_size - rounded_size),
-               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
-               half_precision);
-}
-
-int create_adagrad_optimizer(int optimizer_id,
-                             float alpha = 1e-2,
-                             float eps = 1e-8,
-                             float weight_decay = 0,
-                             bool should_log = false)
-{
-    auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);
-
-    s_optimizers[optimizer_id] = opt;
-
-    if (should_log) {
-        std::string avx_type = "";
-#if defined(__AVX512__)
-        avx_type = "AVX512";
-#else
-#if defined(__AVX256__)
-        avx_type = "AVX2";
-#else
-        avx_type = "scalar";
-#endif
-#endif
-
-        printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
-               optimizer_id,
-               avx_type.c_str());
-        printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
-    }
-
-    return 0;
-}
-
-void Adagrad_Optimizer::Step_8(float* _params,
-                               float* grads,
-                               float* _exp_avg_sq,
-                               size_t _param_size,
-                               __half* dev_params,
-                               bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<8>(
-        &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
-#endif
-    if (_param_size > rounded_size)
-        Step_4((_params + rounded_size),
-               (grads + rounded_size),
-               (_exp_avg_sq + rounded_size),
-               (_param_size - rounded_size),
-               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
-               half_precision);
-}
-
-int ds_adagrad_step(int optimizer_id,
-                    size_t step,
-                    float lr,
-                    float epsilon,
-                    float weight_decay,
-                    torch::Tensor& params,
-                    torch::Tensor& grads,
-                    torch::Tensor& exp_avg_sq)
-{
-    auto params_c = params.contiguous();
-    auto grads_c = grads.contiguous();
-    auto exp_avg_sq_c = exp_avg_sq.contiguous();
-
-    float* params_ptr = (float*)params_c.data_ptr();
-    float* grads_ptr = (float*)grads_c.data_ptr();
-    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
-    std::shared_ptr<Adagrad_Optimizer> opt =
-        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
-    opt->IncrementStep(step);
-    opt->update_state(lr, epsilon, weight_decay);
-    opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
-
-    opt->SynchronizeStreams();
-    return 0;
-}
-
-int ds_adagrad_step_plus_copy(int optimizer_id,
-                              size_t step,
-                              float lr,
-                              float epsilon,
-                              float weight_decay,
-                              torch::Tensor& params,
-                              torch::Tensor& grads,
-                              torch::Tensor& exp_avg_sq,
-                              torch::Tensor& gpu_params)
-{
-    auto params_c = params.contiguous();
-    auto gpu_params_c = gpu_params.contiguous();
-    auto exp_avg_sq_c = exp_avg_sq.contiguous();
-    auto grads_c = grads.contiguous();
-
-    float* params_ptr = (float*)params_c.data_ptr();
-    float* grads_ptr = (float*)grads_c.data_ptr();
-    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
-    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
-    std::shared_ptr<Adagrad_Optimizer> opt =
-        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
-    opt->IncrementStep(step);
-    opt->update_state(lr, epsilon, weight_decay);
-    opt->Step_8(params_ptr,
-                grads_ptr,
-                exp_avg_sq_ptr,
-                params_c.size(0),
-                gpu_params_ptr,
-                (params.options().dtype() == at::kHalf));
-
-    opt->SynchronizeStreams();
-    return 0;
-}
-
-int destroy_adagrad_optimizer(int optimizer_id)
-{
-    s_optimizers.erase(optimizer_id);
-
-    return 0;
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
-    m.def("adagrad_update_copy",
-          &ds_adagrad_step_plus_copy,
-          "DeepSpeed CPU Adagrad update and param copy (C++)");
-    m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
-    m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
-}
--- a/deepspeed/ops/csrc/adam/cpu_adam.cpp
+++ b/deepspeed/ops/csrc/adam/cpu_adam.cpp
-#include "cpu_adam.h"
-#include <cuda_runtime_api.h>
-#include <math.h>
-#include <omp.h>
-#include <torch/extension.h>
-#include <iostream>
-#include <memory>
-#include <type_traits>
-#include <unordered_map>
-#include "cublas_v2.h"
-#include "cuda.h"
-#include "curand.h"
-#include "custom_cuda_layers.h"
-
-static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
-
-// C++ interface
-
-void Adam_Optimizer::Step_1(float* _params,
-                            float* grads,
-                            float* _exp_avg,
-                            float* _exp_avg_sq,
-                            size_t _param_size,
-                            __half* dev_params,
-                            bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<1>(&rounded_size,
-                _params,
-                grads,
-                _exp_avg,
-                _exp_avg_sq,
-                _param_size,
-                dev_params,
-                half_precision);
-#endif
-    if (_param_size > rounded_size) {
-        float betta1_minus1 = 1 - _betta1;
-        float betta2_minus1 = 1 - _betta2;
-
-        float step_size = -1 * _alpha / _bias_correction1;
-        float w_decay = -1 * _alpha * _weight_decay;
-        __half* grads_cast_h;
-        __half* params_cast_h;
-        if (half_precision) {
-            grads_cast_h = reinterpret_cast<__half*>(grads);
-            params_cast_h = reinterpret_cast<__half*>(_params);
-        }
-
-        for (size_t t = rounded_size; t < _param_size; t += TILE) {
-            size_t copy_size = TILE;
-            if ((t + TILE) > _param_size) copy_size = _param_size - t;
-            size_t offset = copy_size + t;
-            if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
-
-#pragma omp parallel for
-            for (size_t k = t; k < offset; k++) {
-                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
-                float param = half_precision ? (float)params_cast_h[k] : _params[k];
-                float momentum = _exp_avg[k];
-                float variance = _exp_avg_sq[k];
-                if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
-                momentum = momentum * _betta1;
-                momentum = grad * betta1_minus1 + momentum;
-
-                variance = variance * _betta2;
-                grad = grad * grad;
-                variance = grad * betta2_minus1 + variance;
-
-                grad = sqrt(variance);
-                grad = grad * _bias_correction2 + _eps;
-                grad = momentum / grad;
-                if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
-                param = grad * step_size + param;
-                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
-
-                if (half_precision)
-                    params_cast_h[k] = (__half)param;
-                else
-                    _params[k] = param;
-                _exp_avg[k] = momentum;
-                _exp_avg_sq[k] = variance;
-            }
-            if (dev_params) {
-                launch_param_update(
-                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
-
-                _buf_index = !_buf_index;
-            }
-        }
-    }
-}
-
-void Adam_Optimizer::Step_4(float* _params,
-                            float* grads,
-                            float* _exp_avg,
-                            float* _exp_avg_sq,
-                            size_t _param_size,
-                            __half* dev_params,
-                            bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<4>(&rounded_size,
-                _params,
-                grads,
-                _exp_avg,
-                _exp_avg_sq,
-                _param_size,
-                dev_params,
-                half_precision);
-#endif
-    if (_param_size > rounded_size)
-        Step_1((_params + rounded_size),
-               (grads + rounded_size),
-               (_exp_avg + rounded_size),
-               (_exp_avg_sq + rounded_size),
-               (_param_size - rounded_size),
-               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
-               half_precision);
-}
-
-int create_adam_optimizer(int optimizer_id,
-                          float alpha = 1e-3,
-                          float betta1 = 0.9,
-                          float betta2 = 0.999,
-                          float eps = 1e-8,
-                          float weight_decay = 0,
-                          bool adamw_mode = true,
-                          bool should_log = false)
-{
-    auto opt =
-        std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
-
-    s_optimizers[optimizer_id] = opt;
-
-    if (should_log) {
-        std::string avx_type = "";
-#if defined(__AVX512__)
-        avx_type = "AVX512";
-#else
-#if defined(__AVX256__)
-        avx_type = "AVX2";
-#else
-        avx_type = "scalar";
-#endif
-#endif
-
-        printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
-               optimizer_id,
-               avx_type.c_str());
-        printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
-               alpha,
-               betta1,
-               betta2,
-               weight_decay,
-               (int)adamw_mode);
-    }
-
-    return 0;
-}
-
-void Adam_Optimizer::Step_8(float* _params,
-                            float* grads,
-                            float* _exp_avg,
-                            float* _exp_avg_sq,
-                            size_t _param_size,
-                            __half* dev_params,
-                            bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<8>(&rounded_size,
-                _params,
-                grads,
-                _exp_avg,
-                _exp_avg_sq,
-                _param_size,
-                dev_params,
-                half_precision);
-#endif
-    if (_param_size > rounded_size)
-        Step_4((_params + rounded_size),
-               (grads + rounded_size),
-               (_exp_avg + rounded_size),
-               (_exp_avg_sq + rounded_size),
-               (_param_size - rounded_size),
-               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
-               half_precision);
-}
-
-int ds_adam_step(int optimizer_id,
-                 size_t step,
-                 float lr,
-                 float beta1,
-                 float beta2,
-                 float epsilon,
-                 float weight_decay,
-                 bool bias_correction,
-                 torch::Tensor& params,
-                 torch::Tensor& grads,
-                 torch::Tensor& exp_avg,
-                 torch::Tensor& exp_avg_sq)
-{
-    auto params_c = params.contiguous();
-    auto grads_c = grads.contiguous();
-    auto exp_avg_c = exp_avg.contiguous();
-    auto exp_avg_sq_c = exp_avg_sq.contiguous();
-
-    // assert(params.options().dtype() == grads.options().dtype());
-
-    float* params_ptr = (float*)params_c.data_ptr();
-    float* grads_ptr = (float*)grads_c.data_ptr();
-    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
-    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
-    std::shared_ptr<Adam_Optimizer> opt =
-        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
-    opt->IncrementStep(step, beta1, beta2);
-    opt->update_state(lr, epsilon, weight_decay, bias_correction);
-
-    opt->Step_8(params_ptr,
-                grads_ptr,
-                exp_avg_ptr,
-                exp_avg_sq_ptr,
-                params_c.size(0),
-                nullptr,
-                (params.options().dtype() == at::kHalf));
-
-    opt->SynchronizeStreams();
-    return 0;
-}
-
-// Apply one Adam step on the host and additionally pass a device parameter
-// buffer to Step_8 so the updated parameters can be pushed to it.
-//
-// Arguments are the same as ds_adam_step, plus:
-//   gpu_params   tensor whose data pointer is handed to Step_8 as __half*.
-//
-// Returns 0. Raises through TORCH_CHECK when optimizer_id is not registered.
-//
-// NOTE(review): gpu_params is reinterpreted as __half without a dtype or size
-// check -- confirm callers always pass a half tensor with params.numel() elements.
-int ds_adam_step_plus_copy(int optimizer_id,
-                           size_t step,
-                           float lr,
-                           float beta1,
-                           float beta2,
-                           float epsilon,
-                           float weight_decay,
-                           bool bias_correction,
-                           torch::Tensor& params,
-                           torch::Tensor& grads,
-                           torch::Tensor& exp_avg,
-                           torch::Tensor& exp_avg_sq,
-                           torch::Tensor& gpu_params)
-{
-    auto params_c = params.contiguous();
-    auto gpu_params_c = gpu_params.contiguous();
-    auto exp_avg_c = exp_avg.contiguous();
-    auto exp_avg_sq_c = exp_avg_sq.contiguous();
-    auto grads_c = grads.contiguous();
-
-    float* params_ptr = (float*)params_c.data_ptr();
-    float* grads_ptr = (float*)grads_c.data_ptr();
-    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
-    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
-    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
-    // Look the optimizer up with find(): operator[] would insert an empty entry
-    // for an unknown id and the resulting null pointer would be dereferenced.
-    auto entry = s_optimizers.find(optimizer_id);
-    TORCH_CHECK(entry != s_optimizers.end(),
-                "CPU Adam optimizer #",
-                optimizer_id,
-                " has not been created");
-    std::shared_ptr<Adam_Optimizer> opt =
-        std::static_pointer_cast<Adam_Optimizer>(entry->second);
-    opt->IncrementStep(step, beta1, beta2);
-    opt->update_state(lr, epsilon, weight_decay, bias_correction);
-    // numel() rather than size(0): the kernels walk a flat buffer, so the total
-    // element count is required. For 1-D tensors both values are identical.
-    opt->Step_8(params_ptr,
-                grads_ptr,
-                exp_avg_ptr,
-                exp_avg_sq_ptr,
-                params_c.numel(),
-                gpu_params_ptr,
-                (params.options().dtype() == at::kHalf));
-
-    opt->SynchronizeStreams();
-    return 0;
-}
-
-// Drop the optimizer stored under optimizer_id from s_optimizers.
-// The result of erase() is ignored; the function returns 0 unconditionally.
-int destroy_adam_optimizer(int optimizer_id)
-{
-    s_optimizers.erase(optimizer_id);
-
-    return 0;
-}
-
-// Python bindings for the CPU Adam extension. Exposed names:
-//   adam_update      -> ds_adam_step
-//   adam_update_copy -> ds_adam_step_plus_copy
-//   create_adam      -> create_adam_optimizer
-//   destroy_adam     -> destroy_adam_optimizer
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
-    m.def("adam_update_copy",
-          &ds_adam_step_plus_copy,
-          "DeepSpeed CPU Adam update and param copy (C++)");
-    m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
-    m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
-}
--- a/deepspeed/ops/csrc/adam/cpu_adam_hip.cpp
+++ b/deepspeed/ops/csrc/adam/cpu_adam_hip.cpp
-// !!! This is a file automatically generated by hipify!!!
-#include "cpu_adam_hip.h"
-#include <hip/hip_runtime_api.h>
-#include <math.h>
-#include <omp.h>
-#include <torch/extension.h>
-#include <iostream>
-#include <memory>
-#include <type_traits>
-#include <unordered_map>
-#include "rocblas.h"
-#include "hip/hip_runtime.h"
-#include "hiprand/hiprand.h"
-#include "custom_hip_layers.h"
-
-// File-local registry of optimizers keyed by the caller-chosen integer id.
-// Values are stored type-erased (shared_ptr<void>) and are cast back to
-// Adam_Optimizer by the functions below that look them up.
-static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
-
-// C++ interface
-
-// Adam update over _param_size elements, with a scalar fallback loop.
-//
-// When AVX is compiled in, Step_AVX<1> runs first and reports through
-// rounded_size how many leading elements it covered (presumably a multiple of
-// the vector width -- defined outside this view). Everything from rounded_size
-// to _param_size is then processed by the scalar loop below in TILE-sized chunks.
-//
-// _params / grads are reinterpreted as __half buffers when half_precision is
-// set; _exp_avg / _exp_avg_sq are always read and written as float.
-// If dev_params is non-null, each chunk's updated parameters are staged in
-// _doubled_buffer and handed to launch_param_update for that chunk.
-void Adam_Optimizer::Step_1(float* _params,
-                            float* grads,
-                            float* _exp_avg,
-                            float* _exp_avg_sq,
-                            size_t _param_size,
-                            __half* dev_params,
-                            bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<1>(&rounded_size,
-                _params,
-                grads,
-                _exp_avg,
-                _exp_avg_sq,
-                _param_size,
-                dev_params,
-                half_precision);
-#endif
-    if (_param_size > rounded_size) {
-        float betta1_minus1 = 1 - _betta1;
-        float betta2_minus1 = 1 - _betta2;
-
-        // Negative step so the update below can be written as an addition.
-        float step_size = -1 * _alpha / _bias_correction1;
-        // Decoupled (AdamW) decay factor applied directly to the parameter.
-        float w_decay = -1 * _alpha * _weight_decay;
-        // Only assigned (and only read) when half_precision is true.
-        __half* grads_cast_h;
-        __half* params_cast_h;
-        if (half_precision) {
-            grads_cast_h = reinterpret_cast<__half*>(grads);
-            params_cast_h = reinterpret_cast<__half*>(_params);
-        }
-
-        for (size_t t = rounded_size; t < _param_size; t += TILE) {
-            // The last chunk may be shorter than TILE.
-            size_t copy_size = TILE;
-            if ((t + TILE) > _param_size) copy_size = _param_size - t;
-            size_t offset = copy_size + t;
-            // Two staging buffers alternate; before reusing one, wait on its stream.
-            if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
-
-#pragma omp parallel for
-            for (size_t k = t; k < offset; k++) {
-                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
-                float param = half_precision ? (float)params_cast_h[k] : _params[k];
-                float momentum = _exp_avg[k];
-                float variance = _exp_avg_sq[k];
-                // L2 mode: fold the decay term into the gradient.
-                if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
-                // momentum = betta1 * momentum + (1 - betta1) * grad
-                momentum = momentum * _betta1;
-                momentum = grad * betta1_minus1 + momentum;
-
-                // variance = betta2 * variance + (1 - betta2) * grad^2
-                variance = variance * _betta2;
-                grad = grad * grad;
-                variance = grad * betta2_minus1 + variance;
-
-                // From here on `grad` is reused to hold the normalized update:
-                // momentum / (sqrt(variance) * _bias_correction2 + _eps)
-                grad = sqrt(variance);
-                grad = grad * _bias_correction2 + _eps;
-                grad = momentum / grad;
-                // AdamW mode: decay the parameter directly.
-                if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
-                param = grad * step_size + param;
-                // Stage the new value at its chunk-relative index for the device copy.
-                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
-
-                if (half_precision)
-                    params_cast_h[k] = (__half)param;
-                else
-                    _params[k] = param;
-                _exp_avg[k] = momentum;
-                _exp_avg_sq[k] = variance;
-            }
-            if (dev_params) {
-                launch_param_update(
-                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
-
-                // Flip to the other staging buffer / stream for the next chunk.
-                _buf_index = !_buf_index;
-            }
-        }
-    }
-}
-
-// Adam update using the Step_AVX<4> path when AVX is compiled in.
-// Step_AVX reports through rounded_size how many leading elements it handled;
-// any remainder is forwarded to Step_1 with all pointers advanced by that
-// amount. Without AVX, rounded_size stays 0 and Step_1 handles everything.
-void Adam_Optimizer::Step_4(float* _params,
-                            float* grads,
-                            float* _exp_avg,
-                            float* _exp_avg_sq,
-                            size_t _param_size,
-                            __half* dev_params,
-                            bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<4>(&rounded_size,
-                _params,
-                grads,
-                _exp_avg,
-                _exp_avg_sq,
-                _param_size,
-                dev_params,
-                half_precision);
-#endif
-    if (_param_size > rounded_size)
-        Step_1((_params + rounded_size),
-               (grads + rounded_size),
-               (_exp_avg + rounded_size),
-               (_exp_avg_sq + rounded_size),
-               (_param_size - rounded_size),
-               // A null dev_params must stay null rather than become an offset pointer.
-               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
-               half_precision);
-}
-
-// Construct an Adam_Optimizer from the given hyper-parameters and store it in
-// s_optimizers under optimizer_id, replacing any entry already stored there.
-// When should_log is true, prints the compiled arithmetic path (AVX512, AVX2 or
-// scalar) and the configuration to stdout; eps is not part of that printout.
-// Always returns 0.
-int create_adam_optimizer(int optimizer_id,
-                          float alpha = 1e-3,
-                          float betta1 = 0.9,
-                          float betta2 = 0.999,
-                          float eps = 1e-8,
-                          float weight_decay = 0,
-                          bool adamw_mode = true,
-                          bool should_log = false)
-{
-    auto opt =
-        std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
-
-    s_optimizers[optimizer_id] = opt;
-
-    if (should_log) {
-        // Selected at compile time from the __AVX512__ / __AVX256__ macros.
-        std::string avx_type = "";
-#if defined(__AVX512__)
-        avx_type = "AVX512";
-#else
-#if defined(__AVX256__)
-        avx_type = "AVX2";
-#else
-        avx_type = "scalar";
-#endif
-#endif
-
-        printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
-               optimizer_id,
-               avx_type.c_str());
-        printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
-               alpha,
-               betta1,
-               betta2,
-               weight_decay,
-               (int)adamw_mode);
-    }
-
-    return 0;
-}
-
-// Adam update using the Step_AVX<8> path when AVX is compiled in.
-// Step_AVX reports through rounded_size how many leading elements it handled;
-// any remainder is forwarded to Step_4 with all pointers advanced by that
-// amount. Without AVX, rounded_size stays 0 and Step_4 handles everything.
-void Adam_Optimizer::Step_8(float* _params,
-                            float* grads,
-                            float* _exp_avg,
-                            float* _exp_avg_sq,
-                            size_t _param_size,
-                            __half* dev_params,
-                            bool half_precision)
-{
-    size_t rounded_size = 0;
-#if defined(__AVX512__) or defined(__AVX256__)
-    Step_AVX<8>(&rounded_size,
-                _params,
-                grads,
-                _exp_avg,
-                _exp_avg_sq,
-                _param_size,
-                dev_params,
-                half_precision);
-#endif
-    if (_param_size > rounded_size)
-        Step_4((_params + rounded_size),
-               (grads + rounded_size),
-               (_exp_avg + rounded_size),
-               (_exp_avg_sq + rounded_size),
-               (_param_size - rounded_size),
-               // A null dev_params must stay null rather than become an offset pointer.
-               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
-               half_precision);
-}
-
-// Apply one Adam step on the host to a single parameter tensor.
-// step/beta1/beta2 go to IncrementStep; lr/epsilon/weight_decay/bias_correction
-// go to update_state; the four tensors are passed to Step_8 as raw pointers.
-// No device buffer is supplied (nullptr). Always returns 0.
-int ds_adam_step(int optimizer_id,
-                 size_t step,
-                 float lr,
-                 float beta1,
-                 float beta2,
-                 float epsilon,
-                 float weight_decay,
-                 bool bias_correction,
-                 torch::Tensor& params,
-                 torch::Tensor& grads,
-                 torch::Tensor& exp_avg,
-                 torch::Tensor& exp_avg_sq)
-{
-    // NOTE(review): contiguous() yields a copy for non-contiguous inputs, so the
-    // update would not reach the caller's tensor -- confirm inputs are contiguous.
-    auto params_c = params.contiguous();
-    auto grads_c = grads.contiguous();
-    auto exp_avg_c = exp_avg.contiguous();
-    auto exp_avg_sq_c = exp_avg_sq.contiguous();
-
-    // assert(params.options().dtype() == grads.options().dtype());
-
-    // Typed float* regardless of dtype; the half flag is passed to Step_8 below.
-    float* params_ptr = (float*)params_c.data_ptr();
-    float* grads_ptr = (float*)grads_c.data_ptr();
-    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
-    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
-    // NOTE(review): operator[] default-inserts an empty shared_ptr for an unknown
-    // id, so opt would be null here -- callers must create the optimizer first.
-    std::shared_ptr<Adam_Optimizer> opt =
-        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
-    opt->IncrementStep(step, beta1, beta2);
-    opt->update_state(lr, epsilon, weight_decay, bias_correction);
-
-    // NOTE(review): size(0) is the length of the first dimension only; it equals
-    // the element count only for 1-D tensors -- confirm callers pass flat tensors.
-    opt->Step_8(params_ptr,
-                grads_ptr,
-                exp_avg_ptr,
-                exp_avg_sq_ptr,
-                params_c.size(0),
-                nullptr,
-                (params.options().dtype() == at::kHalf));
-
-    opt->SynchronizeStreams();
-    return 0;
-}
-
-// Same as ds_adam_step, but additionally passes gpu_params (as __half*) to
-// Step_8 as the device parameter buffer. Always returns 0.
-int ds_adam_step_plus_copy(int optimizer_id,
-                           size_t step,
-                           float lr,
-                           float beta1,
-                           float beta2,
-                           float epsilon,
-                           float weight_decay,
-                           bool bias_correction,
-                           torch::Tensor& params,
-                           torch::Tensor& grads,
-                           torch::Tensor& exp_avg,
-                           torch::Tensor& exp_avg_sq,
-                           torch::Tensor& gpu_params)
-{
-    auto params_c = params.contiguous();
-    auto gpu_params_c = gpu_params.contiguous();
-    auto exp_avg_c = exp_avg.contiguous();
-    auto exp_avg_sq_c = exp_avg_sq.contiguous();
-    auto grads_c = grads.contiguous();
-
-    float* params_ptr = (float*)params_c.data_ptr();
-    float* grads_ptr = (float*)grads_c.data_ptr();
-    // NOTE(review): no dtype check -- assumes gpu_params holds half values; confirm.
-    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
-    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
-    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
-
-    // NOTE(review): operator[] default-inserts an empty shared_ptr for an unknown
-    // id, so opt would be null here -- callers must create the optimizer first.
-    std::shared_ptr<Adam_Optimizer> opt =
-        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
-    opt->IncrementStep(step, beta1, beta2);
-    opt->update_state(lr, epsilon, weight_decay, bias_correction);
-    // NOTE(review): size(0) is the length of the first dimension only; it equals
-    // the element count only for 1-D tensors -- confirm callers pass flat tensors.
-    opt->Step_8(params_ptr,
-                grads_ptr,
-                exp_avg_ptr,
-                exp_avg_sq_ptr,
-                params_c.size(0),
-                gpu_params_ptr,
-                (params.options().dtype() == at::kHalf));
-
-    opt->SynchronizeStreams();
-    return 0;
-}
-
-// Remove the entry for optimizer_id from s_optimizers. Erasing a key that is
-// not present is a no-op for unordered_map, so this returns 0 in every case.
-int destroy_adam_optimizer(int optimizer_id)
-{
-    s_optimizers.erase(optimizer_id);
-
-    return 0;
-}
-
-// Python bindings for the CPU Adam extension (HIP build). Exposed names:
-//   adam_update      -> ds_adam_step
-//   adam_update_copy -> ds_adam_step_plus_copy
-//   create_adam      -> create_adam_optimizer
-//   destroy_adam     -> destroy_adam_optimizer
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
-    m.def("adam_update_copy",
-          &ds_adam_step_plus_copy,
-          "DeepSpeed CPU Adam update and param copy (C++)");
-    m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
-    m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
-}
--- a/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp
+++ b/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp
-#include <torch/extension.h>
-
-// Forward declaration only; this translation unit does not define the function.
-// It is bound to Python below under the name multi_tensor_adam.
-void multi_tensor_adam_cuda(int chunk_size,
-                            at::Tensor noop_flag,
-                            std::vector<std::vector<at::Tensor>> tensor_lists,
-                            const float lr,
-                            const float beta1,
-                            const float beta2,
-                            const float epsilon,
-                            const int step,
-                            const int mode,
-                            const int bias_correction,
-                            const float weight_decay);
-
-// Python binding: exposes multi_tensor_adam_cuda as `multi_tensor_adam`.
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("multi_tensor_adam",
-          &multi_tensor_adam_cuda,
-          "Compute and apply gradient update to parameters for Adam optimizer");
-}
--- a/deepspeed/ops/csrc/adam/multi_tensor_adam.cu
+++ b/deepspeed/ops/csrc/adam/multi_tensor_adam.cu
-/* Copyright 2020 The Microsoft DeepSpeed Team
-   Copyright NVIDIA/apex
-   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
-*/
-
-#include <ATen/ATen.h>
-#include <ATen/AccumulateType.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-// Another possibility:
-// #include <torch/all.h>
-
-#include <assert.h>
-
-#include "multi_tensor_apply.cuh"
-#include "type_shim.h"
-
-// BLOCK_SIZE is passed to multi_tensor_apply as the block size for the launch.
-#define BLOCK_SIZE 512
-// Number of elements each thread handles per outer-loop iteration in AdamFunctor.
-#define ILP 4
-
-// Selects how weight decay enters the update in AdamFunctor.
-typedef enum {
-    ADAM_MODE_0 = 0,  // L2 regularization mode
-    ADAM_MODE_1 = 1   // Decoupled weight decay mode(AdamW)
-} adamMode_t;
-
-// Arithmetic inside the kernel is done in float regardless of the storage type T.
-using MATH_T = float;
-
-// Device functor performing the Adam update for one (tensor, chunk) pair.
-//
-// The block index selects the tensor and chunk through tl.block_to_tensor and
-// tl.block_to_chunk. tl.addresses holds, per tensor: [0] gradient, [1] parameter,
-// [2] first moment, [3] second moment, all with storage type T. Values are
-// converted to MATH_T for the arithmetic and parameter/moments are written back;
-// the gradient buffer is only read. noop_gmem is accepted but not consulted.
-template <typename T>
-struct AdamFunctor {
-    __device__ __forceinline__ void operator()(int chunk_size,
-                                               volatile int* noop_gmem,
-                                               TensorListMetadata<4>& tl,
-                                               const float beta1,
-                                               const float beta2,
-                                               const float beta1_correction,
-                                               const float beta2_correction,
-                                               const float epsilon,
-                                               const float lr,
-                                               adamMode_t mode,
-                                               const float decay)
-    {
-        // I'd like this kernel to propagate infs/nans.
-        // if(*noop_gmem == 1)
-        //   return;
-
-        int tensor_loc = tl.block_to_tensor[blockIdx.x];
-
-        // potentially use to pass in list of scalar
-        // int tensor_num = tl.start_tensor_this_launch + tensor_loc;
-
-        int chunk_idx = tl.block_to_chunk[blockIdx.x];
-        int n = tl.sizes[tensor_loc];
-
-        // Advance each base pointer to the start of this block's chunk.
-        // NOTE(review): chunk_idx * chunk_size is evaluated in int -- confirm tensor
-        // sizes stay within int range.
-        T* g = (T*)tl.addresses[0][tensor_loc];
-        g += chunk_idx * chunk_size;
-
-        T* p = (T*)tl.addresses[1][tensor_loc];
-        p += chunk_idx * chunk_size;
-
-        T* m = (T*)tl.addresses[2][tensor_loc];
-        m += chunk_idx * chunk_size;
-
-        T* v = (T*)tl.addresses[3][tensor_loc];
-        v += chunk_idx * chunk_size;
-
-        // n becomes the number of elements remaining from this chunk's start.
-        n -= chunk_idx * chunk_size;
-
-        // see note in multi_tensor_scale_kernel.cu
-        for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
-            MATH_T r_g[ILP];
-            MATH_T r_p[ILP];
-            MATH_T r_m[ILP];
-            MATH_T r_v[ILP];
-            // Load phase: out-of-range slots are zero-filled so the math below can
-            // run unconditionally; they are never stored back.
-#pragma unroll
-            for (int ii = 0; ii < ILP; ii++) {
-                int i = i_start + threadIdx.x + ii * blockDim.x;
-                if (i < n && i < chunk_size) {
-                    r_g[ii] = g[i];
-                    r_p[ii] = p[i];
-                    r_m[ii] = m[i];
-                    r_v[ii] = v[i];
-                } else {
-                    r_g[ii] = MATH_T(0);
-                    r_p[ii] = MATH_T(0);
-                    r_m[ii] = MATH_T(0);
-                    r_v[ii] = MATH_T(0);
-                }
-            }
-            // Compute phase. The two branches differ only in where decay enters:
-            // L2 adds decay * p to the gradient before the moment updates, while
-            // the weight-decay branch adds decay * p to the final update term.
-#pragma unroll
-            for (int ii = 0; ii < ILP; ii++) {
-                if (mode == ADAM_MODE_0) {  // L2
-                    r_g[ii] = r_g[ii] + (decay * r_p[ii]);
-                    r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
-                    r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
-                    MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
-                    MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
-                    MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
-                    MATH_T update = next_m_unbiased / denom;
-                    r_p[ii] = r_p[ii] - (lr * update);
-                } else {  // weight decay
-                    r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
-                    r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
-                    MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
-                    MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
-                    MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
-                    MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
-                    r_p[ii] = r_p[ii] - (lr * update);
-                }
-            }
-            // Store phase: write back parameter and both moments for in-range slots.
-#pragma unroll
-            for (int ii = 0; ii < ILP; ii++) {
-                int i = i_start + threadIdx.x + ii * blockDim.x;
-                if (i < n && i < chunk_size) {
-                    p[i] = r_p[ii];
-                    m[i] = r_m[ii];
-                    v[i] = r_v[ii];
-                }
-            }
-        }
-    }
-};
-
-// Host entry point for the fused multi-tensor Adam update.
-//
-// Arguments:
-//   chunk_size, noop_flag, tensor_lists
-//                    forwarded to multi_tensor_apply<4>; the AdamFunctor reads the
-//                    four lists as gradient, parameter, first moment, second moment.
-//   lr, beta1, beta2, epsilon, weight_decay
-//                    Adam hyper-parameters forwarded to the functor.
-//   step             exponent used for the bias-correction terms.
-//   mode             cast to adamMode_t (0 = L2, 1 = decoupled weight decay).
-//   bias_correction  when equal to 1, the correction terms 1 - beta^step are used;
-//                    otherwise both are left at 1.
-//
-// Raises through TORCH_CHECK if tensor_lists or tensor_lists[0] is empty, and
-// through AT_CUDA_CHECK if the launch left a CUDA error behind.
-void multi_tensor_adam_cuda(int chunk_size,
-                            at::Tensor noop_flag,
-                            std::vector<std::vector<at::Tensor>> tensor_lists,
-                            const float lr,
-                            const float beta1,
-                            const float beta2,
-                            const float epsilon,
-                            const int step,
-                            const int mode,
-                            const int bias_correction,
-                            const float weight_decay)
-{
-    using namespace at;
-
-    // The dispatch below reads tensor_lists[0][0] before multi_tensor_apply gets
-    // a chance to validate its input, so reject empty input here.
-    TORCH_CHECK(!tensor_lists.empty() && !tensor_lists[0].empty(),
-                "multi_tensor_adam: tensor_lists[0] must contain at least one tensor");
-
-    // Handle bias correction mode
-    float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
-    if (bias_correction == 1) {
-        bias_correction1 = 1 - std::pow(beta1, step);
-        bias_correction2 = 1 - std::pow(beta2, step);
-    }
-
-    // Assume single type across p,g,m1,m2 now
-    DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(),
-                                   0,
-                                   "adam",
-                                   multi_tensor_apply<4>(BLOCK_SIZE,
-                                                         chunk_size,
-                                                         noop_flag,
-                                                         tensor_lists,
-                                                         AdamFunctor<scalar_t_0>(),
-                                                         beta1,
-                                                         beta2,
-                                                         bias_correction1,
-                                                         bias_correction2,
-                                                         epsilon,
-                                                         lr,
-                                                         (adamMode_t)mode,
-                                                         weight_decay);)
-
-    AT_CUDA_CHECK(cudaGetLastError());
-}
--- a/deepspeed/ops/csrc/adam/multi_tensor_adam.hip
+++ b/deepspeed/ops/csrc/adam/multi_tensor_adam.hip
-// !!! This is a file automatically generated by hipify!!!
-/* Copyright 2020 The Microsoft DeepSpeed Team
-   Copyright NVIDIA/apex
-   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
-*/
-
-#include <ATen/ATen.h>
-#include <ATen/AccumulateType.h>
-#include <ATen/hip/HIPContext.h>
-#include <ATen/hip/Exceptions.h>
-// Another possibility:
-// #include <torch/all.h>
-
-#include <assert.h>
-
-#include "multi_tensor_apply_hip.cuh"
-#include "type_shim_hip.h"
-
-// BLOCK_SIZE is passed to multi_tensor_apply as the block size for the launch.
-#define BLOCK_SIZE 512
-// Number of elements each thread handles per outer-loop iteration in AdamFunctor.
-#define ILP 4
-
-// Selects how weight decay enters the update in AdamFunctor.
-typedef enum {
-    ADAM_MODE_0 = 0,  // L2 regularization mode
-    ADAM_MODE_1 = 1   // Decoupled weight decay mode(AdamW)
-} adamMode_t;
-
-// Arithmetic inside the kernel is done in float regardless of the storage type T.
-using MATH_T = float;
-
-// Device functor performing the Adam update for one (tensor, chunk) pair.
-//
-// The block index selects the tensor and chunk through tl.block_to_tensor and
-// tl.block_to_chunk. tl.addresses holds, per tensor: [0] gradient, [1] parameter,
-// [2] first moment, [3] second moment, all with storage type T. Values are
-// converted to MATH_T for the arithmetic and parameter/moments are written back;
-// the gradient buffer is only read. noop_gmem is accepted but not consulted.
-template <typename T>
-struct AdamFunctor {
-    __device__ __forceinline__ void operator()(int chunk_size,
-                                               volatile int* noop_gmem,
-                                               TensorListMetadata<4>& tl,
-                                               const float beta1,
-                                               const float beta2,
-                                               const float beta1_correction,
-                                               const float beta2_correction,
-                                               const float epsilon,
-                                               const float lr,
-                                               adamMode_t mode,
-                                               const float decay)
-    {
-        // I'd like this kernel to propagate infs/nans.
-        // if(*noop_gmem == 1)
-        //   return;
-
-        int tensor_loc = tl.block_to_tensor[blockIdx.x];
-
-        // potentially use to pass in list of scalar
-        // int tensor_num = tl.start_tensor_this_launch + tensor_loc;
-
-        int chunk_idx = tl.block_to_chunk[blockIdx.x];
-        int n = tl.sizes[tensor_loc];
-
-        // Advance each base pointer to the start of this block's chunk.
-        // NOTE(review): chunk_idx * chunk_size is evaluated in int -- confirm tensor
-        // sizes stay within int range.
-        T* g = (T*)tl.addresses[0][tensor_loc];
-        g += chunk_idx * chunk_size;
-
-        T* p = (T*)tl.addresses[1][tensor_loc];
-        p += chunk_idx * chunk_size;
-
-        T* m = (T*)tl.addresses[2][tensor_loc];
-        m += chunk_idx * chunk_size;
-
-        T* v = (T*)tl.addresses[3][tensor_loc];
-        v += chunk_idx * chunk_size;
-
-        // n becomes the number of elements remaining from this chunk's start.
-        n -= chunk_idx * chunk_size;
-
-        // see note in multi_tensor_scale_kernel.cu
-        for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
-            MATH_T r_g[ILP];
-            MATH_T r_p[ILP];
-            MATH_T r_m[ILP];
-            MATH_T r_v[ILP];
-            // Load phase: out-of-range slots are zero-filled so the math below can
-            // run unconditionally; they are never stored back.
-#pragma unroll
-            for (int ii = 0; ii < ILP; ii++) {
-                int i = i_start + threadIdx.x + ii * blockDim.x;
-                if (i < n && i < chunk_size) {
-                    r_g[ii] = g[i];
-                    r_p[ii] = p[i];
-                    r_m[ii] = m[i];
-                    r_v[ii] = v[i];
-                } else {
-                    r_g[ii] = MATH_T(0);
-                    r_p[ii] = MATH_T(0);
-                    r_m[ii] = MATH_T(0);
-                    r_v[ii] = MATH_T(0);
-                }
-            }
-            // Compute phase. The two branches differ only in where decay enters:
-            // L2 adds decay * p to the gradient before the moment updates, while
-            // the weight-decay branch adds decay * p to the final update term.
-#pragma unroll
-            for (int ii = 0; ii < ILP; ii++) {
-                if (mode == ADAM_MODE_0) {  // L2
-                    r_g[ii] = r_g[ii] + (decay * r_p[ii]);
-                    r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
-                    r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
-                    MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
-                    MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
-                    MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
-                    MATH_T update = next_m_unbiased / denom;
-                    r_p[ii] = r_p[ii] - (lr * update);
-                } else {  // weight decay
-                    r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
-                    r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
-                    MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
-                    MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
-                    MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
-                    MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
-                    r_p[ii] = r_p[ii] - (lr * update);
-                }
-            }
-            // Store phase: write back parameter and both moments for in-range slots.
-#pragma unroll
-            for (int ii = 0; ii < ILP; ii++) {
-                int i = i_start + threadIdx.x + ii * blockDim.x;
-                if (i < n && i < chunk_size) {
-                    p[i] = r_p[ii];
-                    m[i] = r_m[ii];
-                    v[i] = r_v[ii];
-                }
-            }
-        }
-    }
-};
-
-// Host entry point for the fused multi-tensor Adam update (HIP build).
-// Computes the bias-correction terms on the host, then dispatches on the scalar
-// type of the first tensor and forwards everything to multi_tensor_apply<4> with
-// an AdamFunctor. `mode` is cast to adamMode_t. A pending HIP error after the
-// launch is surfaced through AT_CUDA_CHECK.
-void multi_tensor_adam_cuda(int chunk_size,
-                            at::Tensor noop_flag,
-                            std::vector<std::vector<at::Tensor>> tensor_lists,
-                            const float lr,
-                            const float beta1,
-                            const float beta2,
-                            const float epsilon,
-                            const int step,
-                            const int mode,
-                            const int bias_correction,
-                            const float weight_decay)
-{
-    using namespace at;
-
-    // Handle bias correction mode
-    // Both terms stay at 1 unless bias_correction is exactly 1.
-    float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
-    if (bias_correction == 1) {
-        bias_correction1 = 1 - ::pow(beta1, step);
-        bias_correction2 = 1 - ::pow(beta2, step);
-    }
-
-    // Assume single type across p,g,m1,m2 now
-    // NOTE(review): tensor_lists[0][0] is read here before any emptiness check --
-    // callers must pass at least one tensor.
-    DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(),
-                                   0,
-                                   "adam",
-                                   multi_tensor_apply<4>(BLOCK_SIZE,
-                                                         chunk_size,
-                                                         noop_flag,
-                                                         tensor_lists,
-                                                         AdamFunctor<scalar_t_0>(),
-                                                         beta1,
-                                                         beta2,
-                                                         bias_correction1,
-                                                         bias_correction2,
-                                                         epsilon,
-                                                         lr,
-                                                         (adamMode_t)mode,
-                                                         weight_decay);)
-
-    AT_CUDA_CHECK(hipGetLastError());
-}
--- a/deepspeed/ops/csrc/adam/multi_tensor_apply.cuh
+++ b/deepspeed/ops/csrc/adam/multi_tensor_apply.cuh
-/* Copyright 2020 The Microsoft DeepSpeed Team
-   Copyright NVIDIA/apex
-   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
-*/
-
-#include <ATen/ATen.h>
-#include <ATen/AccumulateType.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include "compat.h"
-
-#include <assert.h>
-
-// #include <iostream>
-
-// This header is the one-stop shop for all your multi-tensor apply needs.
-
-// TODO:  Kernel arg size limit may be <4KB for some other cards (ie Jetson)
-// Per-launch capacity limits, indexed by (depth - 1) where depth is the number
-// of tensor lists: how many tensors and how many blocks one launch may describe.
-constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
-constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
-
-// Metadata describing one launch; passed by value to multi_tensor_apply_kernel.
-// n is the number of tensor lists (the "depth").
-template <int n>
-struct TensorListMetadata {
-    // addresses[d][t]: data pointer of tensor slot t in list d.
-    void* addresses[n][depth_to_max_tensors[n - 1]];
-    // Element count of each tensor slot.
-    int sizes[depth_to_max_tensors[n - 1]];
-    // For each block: which tensor slot it works on. unsigned char suffices
-    // because the largest per-launch tensor count above is 110.
-    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
-    // For each block: which chunk of that tensor it works on.
-    int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a full int.
-    // Index (in the caller's lists) of the tensor stored in slot 0 for this launch.
-    int start_tensor_this_launch;
-};
-
-// Generic kernel: forwards the chunk size, no-op flag, launch metadata and any
-// extra arguments to the user-supplied callable. All per-element work lives in
-// the callable.
-template <typename T, typename U, typename... ArgTypes>
-__global__ void multi_tensor_apply_kernel(int chunk_size,
-                                          volatile int* noop_flag,
-                                          T tl,
-                                          U callable,
-                                          ArgTypes... args)
-{
-    // Hand the chunk information to the user-supplied functor to process however it likes.
-    callable(chunk_size, noop_flag, tl, args...);
-}
-
-// Apply `callable` to every chunk of every tensor in tensor_lists.
-//
-// tensor_lists must hold exactly `depth` lists of equal, non-zero length; every
-// tensor must be contiguous, on the same CUDA device as the first tensor, and
-// match the element count of the tensor at the same position in list 0. Any
-// violation raises through TORCH_CHECK.
-//
-// Tensors are split into chunks of chunk_size elements, one block per chunk.
-// Metadata is accumulated into a TensorListMetadata and a kernel is launched
-// whenever the tensor slots or block slots fill up, or the final chunk is reached.
-template <int depth, typename T, typename... ArgTypes>
-void multi_tensor_apply(int block_size,
-                        int chunk_size,
-                        const at::Tensor& noop_flag,
-                        const std::vector<std::vector<at::Tensor>>& tensor_lists,
-                        T callable,
-                        ArgTypes... args)
-{
-    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
-    int len0 = tensor_lists[0].size();
-    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
-    auto ref_device = tensor_lists[0][0].device();
-    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
-    for (int l = 0; l < tensor_lists.size(); l++)  // No range-based for because I need indices
-    {
-        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
-        for (int t = 0; t < tensor_lists[l].size(); t++) {
-            // TODO:  Print which tensor fails.
-            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
-#ifdef VERSION_GE_1_5
-            // Channels-last layout is also accepted when this macro is defined.
-            contiguous_memory = (contiguous_memory ||
-                                 tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
-#endif
-            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-            TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
-                        "A tensor was not on the same device as the first tensor");
-            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
-        }
-    }
-
-    int ntensors = tensor_lists[0].size();
-
-    TensorListMetadata<depth> tl;
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
-    auto stream = at::cuda::getCurrentCUDAStream();
-
-    tl.start_tensor_this_launch = 0;
-    // Number of block slots / tensor slots filled in `tl` for the pending launch.
-    int loc_block_info = 0;
-    int loc_tensor_info = 0;
-    for (int t = 0; t < ntensors; t++) {
-        // NOTE(review): numel() is stored into an int field -- confirm tensor sizes
-        // stay within int range.
-        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
-        for (int d = 0; d < depth; d++)
-            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
-        loc_tensor_info++;
-
-        // Ceiling division: a partial trailing chunk still needs its own block.
-        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
-
-        for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
-            // std::cout << chunks_this_tensor << std::endl;
-            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
-            tl.block_to_chunk[loc_block_info] = chunk;
-            loc_block_info++;
-
-            // Tensor slots only count as full once the current tensor's last chunk
-            // has been recorded, so a tensor's chunks are not cut off mid-way by it.
-            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
-                                 chunk == chunks_this_tensor - 1);
-            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
-            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
-            if (tensors_full || blocks_full || last_chunk) {
-                // using accscalar_t = acc_type<scalar_t, true>;
-                multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
-                    chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);
-
-                AT_CUDA_CHECK(cudaGetLastError());
-
-                // Reset.  The control flow possibilities here make my brain hurt.
-                loc_block_info = 0;
-                if (chunk == chunks_this_tensor - 1) {
-                    // Current tensor is finished: start the next launch empty.
-                    // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 <<
-                    // std::endl;
-                    loc_tensor_info = 0;
-                    tl.start_tensor_this_launch = t + 1;
-                } else {
-                    // Current tensor still has chunks left: carry it over into
-                    // slot 0 of the next launch.
-                    // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 <<
-                    // std::endl;
-                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
-                    for (int d = 0; d < depth; d++)
-                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
-                    loc_tensor_info = 1;
-                    tl.start_tensor_this_launch = t;
-                }
-            }
-        }
-    }
-}
--- a/deepspeed/ops/csrc/adam/multi_tensor_apply_hip.cuh
+++ b/deepspeed/ops/csrc/adam/multi_tensor_apply_hip.cuh
-// !!! This is a file automatically generated by hipify!!!
-#include "hip/hip_runtime.h"
-/* Copyright 2020 The Microsoft DeepSpeed Team
-   Copyright NVIDIA/apex
-   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
-*/
-
-#include <ATen/ATen.h>
-#include <ATen/AccumulateType.h>
-#include <ATen/hip/HIPContext.h>
-#include <ATen/hip/Exceptions.h>
-#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
-#include "compat.h"
-
-#include <assert.h>
-
-// #include <iostream>
-
-// This header is the one-stop shop for all your multi-tensor apply needs.
-
-// TODO:  Kernel arg size limit may be <4KB for some other cards (ie Jetson)
-constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
-constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
-
-template <int n>
-struct TensorListMetadata {
-    void* addresses[n][depth_to_max_tensors[n - 1]];
-    int sizes[depth_to_max_tensors[n - 1]];
-    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
-    int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a full int.
-    int start_tensor_this_launch;
-};
-
-template <typename T, typename U, typename... ArgTypes>
-__global__ void multi_tensor_apply_kernel(int chunk_size,
-                                          volatile int* noop_flag,
-                                          T tl,
-                                          U callable,
-                                          ArgTypes... args)
-{
-    // Hand the chunk information to the user-supplied functor to process however it likes.
-    callable(chunk_size, noop_flag, tl, args...);
-}
-
-template <int depth, typename T, typename... ArgTypes>
-void multi_tensor_apply(int block_size,
-                        int chunk_size,
-                        const at::Tensor& noop_flag,
-                        const std::vector<std::vector<at::Tensor>>& tensor_lists,
-                        T callable,
-                        ArgTypes... args)
-{
-    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
-    int len0 = tensor_lists[0].size();
-    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
-    auto ref_device = tensor_lists[0][0].device();
-    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
-    for (int l = 0; l < tensor_lists.size(); l++)  // No range-based for because I need indices
-    {
-        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
-        for (int t = 0; t < tensor_lists[l].size(); t++) {
-            // TODO:  Print which tensor fails.
-            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
-#ifdef VERSION_GE_1_5
-            contiguous_memory = (contiguous_memory ||
-                                 tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
-#endif
-            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-            TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
-                        "A tensor was not on the same device as the first tensor");
-            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
-        }
-    }
-
-    int ntensors = tensor_lists[0].size();
-
-    TensorListMetadata<depth> tl;
-
-    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
-    auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
-
-    tl.start_tensor_this_launch = 0;
-    int loc_block_info = 0;
-    int loc_tensor_info = 0;
-    for (int t = 0; t < ntensors; t++) {
-        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
-        for (int d = 0; d < depth; d++)
-            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
-        loc_tensor_info++;
-
-        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
-
-        for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
-            // std::cout << chunks_this_tensor << std::endl;
-            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
-            tl.block_to_chunk[loc_block_info] = chunk;
-            loc_block_info++;
-
-            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
-                                 chunk == chunks_this_tensor - 1);
-            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
-            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
-            if (tensors_full || blocks_full || last_chunk) {
-                // using accscalar_t = acc_type<scalar_t, true>;
-               hipLaunchKernelGGL(( multi_tensor_apply_kernel), dim3(loc_block_info), dim3(block_size), 0, stream, 
-                    chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);
-
-                AT_CUDA_CHECK(hipGetLastError());
-
-                // Reset.  The control flow possibilities here make my brain hurt.
-                loc_block_info = 0;
-                if (chunk == chunks_this_tensor - 1) {
-                    // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 <<
-                    // std::endl;
-                    loc_tensor_info = 0;
-                    tl.start_tensor_this_launch = t + 1;
-                } else {
-                    // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 <<
-                    // std::endl;
-                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
-                    for (int d = 0; d < depth; d++)
-                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
-                    loc_tensor_info = 1;
-                    tl.start_tensor_this_launch = t;
-                }
-            }
-        }
-    }
-}
--- a/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.cpp
+++ b/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.cpp
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
-Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
-*/
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <fcntl.h>
-#include <libaio.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-
-#include <algorithm>
-#include <chrono>
-#include <cmath>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <limits>
-#include <numeric>
-#include <string>
-#include <vector>
-
-#include "deepspeed_aio_common.h"
-
-using namespace std;
-using namespace std::chrono;
-
-#define DEBUG_DS_AIO_PERF 0
-#define DEBUG_DS_AIO_SUBMIT_PERF 0
-
-static const std::string c_library_name = "deepspeed_aio";
-
-static void _report_aio_statistics(const char* tag,
-                                   const std::vector<std::chrono::duration<double>>& latencies)
-    __attribute__((unused));
-
-static void _report_aio_statistics(const char* tag,
-                                   const std::vector<std::chrono::duration<double>>& latencies)
-{
-    std::vector<double> lat_usec;
-    for (auto& lat : latencies) { lat_usec.push_back(lat.count() * 1e6); }
-    const auto min_lat = *(std::min_element(lat_usec.begin(), lat_usec.end()));
-    const auto max_lat = *(std::max_element(lat_usec.begin(), lat_usec.end()));
-    const auto avg_lat = std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size();
-
-    std::cout << c_library_name << ": latency statistics(usec) " << tag
-              << " min/max/avg = " << min_lat << " " << max_lat << " " << avg_lat << std::endl;
-}
-
-static void _get_aio_latencies(std::vector<std::chrono::duration<double>>& raw_latencies,
-                               struct deepspeed_aio_latency_t& summary_latencies)
-{
-    std::vector<double> lat_usec;
-    for (auto& lat : raw_latencies) { lat_usec.push_back(lat.count() * 1e6); }
-    summary_latencies._min_usec = *(std::min_element(lat_usec.begin(), lat_usec.end()));
-    summary_latencies._max_usec = *(std::max_element(lat_usec.begin(), lat_usec.end()));
-    summary_latencies._avg_usec =
-        std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size();
-}
-
-static void _do_io_submit_singles(const long long int n_iocbs,
-                                  const long long int iocb_index,
-                                  std::unique_ptr<aio_context>& aio_ctxt,
-                                  std::vector<std::chrono::duration<double>>& submit_times)
-{
-    for (auto i = 0; i < n_iocbs; ++i) {
-        const auto st = std::chrono::high_resolution_clock::now();
-        const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, 1, aio_ctxt->_iocbs.data() + i);
-        submit_times.push_back(std::chrono::high_resolution_clock::now() - st);
-#if DEBUG_DS_AIO_SUBMIT_PERF
-        printf("submit(usec) %f io_index=%lld buf=%p len=%lu off=%llu \n",
-               submit_times.back().count() * 1e6,
-               iocb_index,
-               aio_ctxt->_iocbs[i]->u.c.buf,
-               aio_ctxt->_iocbs[i]->u.c.nbytes,
-               aio_ctxt->_iocbs[i]->u.c.offset);
-#endif
-        assert(submit_ret > 0);
-    }
-}
-
-static void _do_io_submit_block(const long long int n_iocbs,
-                                const long long int iocb_index,
-                                std::unique_ptr<aio_context>& aio_ctxt,
-                                std::vector<std::chrono::duration<double>>& submit_times)
-{
-    const auto st = std::chrono::high_resolution_clock::now();
-    const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, n_iocbs, aio_ctxt->_iocbs.data());
-    submit_times.push_back(std::chrono::high_resolution_clock::now() - st);
-#if DEBUG_DS_AIO_SUBMIT_PERF
-    printf("submit(usec) %f io_index=%lld nr=%lld buf=%p len=%lu off=%llu \n",
-           submit_times.back().count() * 1e6,
-           iocb_index,
-           n_iocbs,
-           aio_ctxt->_iocbs[0]->u.c.buf,
-           aio_ctxt->_iocbs[0]->u.c.nbytes,
-           aio_ctxt->_iocbs[0]->u.c.offset);
-#endif
-    assert(submit_ret > 0);
-}
-
-static int _do_io_complete(const long long int min_completes,
-                           const long long int max_completes,
-                           std::unique_ptr<aio_context>& aio_ctxt,
-                           std::vector<std::chrono::duration<double>>& reap_times)
-{
-    const auto start_time = std::chrono::high_resolution_clock::now();
-    const auto n_completes = io_getevents(
-        aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr);
-    reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time);
-
-    assert(n_completes >= min_completes);
-    return n_completes;
-}
-
-void do_aio_operation_sequential(const bool read_op,
-                                 std::unique_ptr<aio_context>& aio_ctxt,
-                                 std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
-                                 deepspeed_aio_config_t* config,
-                                 deepspeed_aio_perf_t* perf)
-{
-    struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs);
-
-    const auto num_io_blocks = static_cast<long long int>(
-        ceil(static_cast<double>(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size));
-#if DEBUG_DS_AIO_PERF
-    const auto io_op_name = std::string(read_op ? "read" : "write");
-    std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes
-              << " bytes with " << num_io_blocks << " io blocks" << std::endl;
-#endif
-
-    std::vector<std::chrono::duration<double>> submit_times;
-    std::vector<std::chrono::duration<double>> reap_times;
-    const auto max_queue_bytes =
-        static_cast<long long int>(aio_ctxt->_queue_depth * aio_ctxt->_block_size);
-
-    auto start = std::chrono::high_resolution_clock::now();
-    for (long long iocb_index = 0; iocb_index < num_io_blocks;
-         iocb_index += aio_ctxt->_queue_depth) {
-        const auto start_offset = iocb_index * aio_ctxt->_block_size;
-        const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset;
-        const auto n_iocbs =
-            min(static_cast<long long>(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index));
-        const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset));
-        prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset);
-
-        if (config->_single_submit) {
-            _do_io_submit_singles(n_iocbs, iocb_index, aio_ctxt, submit_times);
-        } else {
-            _do_io_submit_block(n_iocbs, iocb_index, aio_ctxt, submit_times);
-        }
-
-        _do_io_complete(n_iocbs, n_iocbs, aio_ctxt, reap_times);
-    }
-    const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
-
-    if (perf) {
-        _get_aio_latencies(submit_times, perf->_submit);
-        _get_aio_latencies(reap_times, perf->_complete);
-        perf->_e2e_usec = elapsed.count() * 1e6;
-        perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9);
-    }
-
-#if DEBUG_DS_AIO_PERF
-    _report_aio_statistics("submit", submit_times);
-    _report_aio_statistics("complete", reap_times);
-#endif
-
-#if DEBUG_DS_AIO_PERF
-    std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6
-              << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl;
-#endif
-
-#if DEBUG_DS_AIO_PERF
-    std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes
-              << " bytes " << std::endl;
-#endif
-}
-
-void do_aio_operation_overlap(const bool read_op,
-                              std::unique_ptr<aio_context>& aio_ctxt,
-                              std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
-                              deepspeed_aio_config_t* config,
-                              deepspeed_aio_perf_t* perf)
-{
-    struct io_prep_generator io_gen(read_op, xfer_ctxt, aio_ctxt->_block_size);
-
-#if DEBUG_DS_AIO_PERF
-    const auto io_op_name = std::string(read_op ? "read" : "write");
-    std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes
-              << " bytes with " << io_gen._num_io_blocks << " io blocks" << std::endl;
-#endif
-
-    std::vector<std::chrono::duration<double>> submit_times;
-    std::vector<std::chrono::duration<double>> reap_times;
-
-    auto request_iocbs = aio_ctxt->_queue_depth;
-    auto n_pending_iocbs = 0;
-    const auto min_completes = 1;
-    auto start = std::chrono::high_resolution_clock::now();
-    while (true) {
-        const auto n_iocbs = io_gen.prep_iocbs(request_iocbs - n_pending_iocbs, &aio_ctxt->_iocbs);
-        if (n_iocbs > 0) {
-            if (config->_single_submit) {
-                _do_io_submit_singles(
-                    n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times);
-            } else {
-                _do_io_submit_block(
-                    n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times);
-            }
-        }
-
-        n_pending_iocbs += n_iocbs;
-        assert(n_pending_iocbs <= aio_ctxt->_queue_depth);
-
-        if (n_pending_iocbs == 0) { break; }
-
-        const auto n_complete =
-            _do_io_complete(min_completes, n_pending_iocbs, aio_ctxt, reap_times);
-        n_pending_iocbs -= n_complete;
-    }
-
-    const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
-
-    if (perf) {
-        _get_aio_latencies(submit_times, perf->_submit);
-        _get_aio_latencies(reap_times, perf->_complete);
-        perf->_e2e_usec = elapsed.count() * 1e6;
-        perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9);
-    }
-
-#if DEBUG_DS_AIO_PERF
-    _report_aio_statistics("submit", submit_times);
-    _report_aio_statistics("complete", reap_times);
-#endif
-
-#if DEBUG_DS_AIO_PERF
-    std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6
-              << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl;
-#endif
-
-#if DEBUG_DS_AIO_PERF
-    std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes
-              << " bytes " << std::endl;
-#endif
-}
-
-void report_file_error(const char* filename, const std::string file_op, const int error_code)
-{
-    std::string err_msg = file_op + std::string(" failed on ") + std::string(filename) +
-                          " error = " + std::to_string(error_code);
-    std::cerr << c_library_name << ":  " << err_msg << std::endl;
-}
-
-int open_file(const char* filename, const bool read_op)
-{
-    const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT);
-    const int mode = 0600;
-    const auto fd = open(filename, flags, mode);
-    if (fd == -1) {
-        const auto error_code = errno;
-        const auto error_msg = read_op ? " open for read " : " open for write ";
-        report_file_error(filename, error_msg, error_code);
-        return -1;
-    }
-    return fd;
-}
-
-int regular_read(const char* filename, std::vector<char>& buffer)
-{
-    long long int num_bytes;
-    const auto f_size = get_file_size(filename, num_bytes);
-    assert(f_size != -1);
-    buffer.resize(num_bytes);
-    const auto fd = open(filename, O_RDONLY, 0600);
-    assert(fd != -1);
-    long long int read_bytes = 0;
-    auto r = 0;
-    do {
-        const auto buffer_ptr = buffer.data() + read_bytes;
-        const auto bytes_to_read = num_bytes - read_bytes;
-        r = read(fd, buffer_ptr, bytes_to_read);
-        read_bytes += r;
-    } while (r > 0);
-
-    if (read_bytes != num_bytes) {
-        std::cerr << "read error "
-                  << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes
-                  << std::endl;
-    }
-    assert(read_bytes == num_bytes);
-    close(fd);
-    return 0;
-}
-
-static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes)
-{
-    std::vector<char> regular_buffer;
-    const auto reg_ret = regular_read(filename, regular_buffer);
-    assert(0 == reg_ret);
-    std::cout << "regular read of " << filename << " returned " << regular_buffer.size() << " bytes"
-              << std::endl;
-
-    if (static_cast<long long int>(regular_buffer.size()) != num_bytes) { return false; }
-
-    return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size()));
-}
-
-bool validate_aio_operation(const bool read_op,
-                            const char* filename,
-                            void* aio_buffer,
-                            const long long int num_bytes)
-{
-    const auto msg_suffix = std::string("deepspeed_aio_") +
-                            std::string(read_op ? "read()" : "write()") +
-                            std::string("using read()");
-
-    if (false == _validate_buffer(filename, aio_buffer, num_bytes)) {
-        std::cout << "Fail: correctness of " << msg_suffix << std::endl;
-        return false;
-    }
-
-    std::cout << "Pass: correctness of  " << msg_suffix << std::endl;
-    return true;
-}
--- a/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.h
+++ b/deepspeed/ops/csrc/aio/common/deepspeed_aio_common.h
-/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
-Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
-*/
-
-#include <deepspeed_aio_utils.h>
-#include <stdlib.h>
-#include <memory>
-#include <string>
-
-using namespace std;
-
-void do_aio_operation_sequential(const bool read_op,
-                                 std::unique_ptr<aio_context>& aio_ctxt,
-                                 std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
-                                 deepspeed_aio_config_t* config,
-                                 deepspeed_aio_perf_t* perf);
-
-void do_aio_operation_overlap(const bool read_op,
-                              std::unique_ptr<aio_context>& aio_ctxt,
-                              std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
-                              deepspeed_aio_config_t* config,
-                              deepspeed_aio_perf_t* perf);
-
-int open_file(const char* filename, const bool read_op);
-
-void report_file_error(const char* filename, const std::string file_op, const int error_code);
-
-int regular_read(const char* filename, std::vector<char>& buffer);
-
-bool validate_aio_operation(const bool read_op,
-                            const char* filename,
-                            void* aio_buffer,
-                            const long long int num_bytes);