Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once
#include "ds_kernel_utils.h"
#include <cuda_fp16.h>
#include <stdint.h>
#ifdef BF16_AVAILABLE
#include <cuda_bf16.h>
#endif
namespace conversion {
// Basic primitive for constructing conversions
template <typename TO, typename FROM>
DS_D_INLINE TO to(FROM val)
{
// Deliberately self-referential: this primary template only exists so that
// conversion::to<TO>(from) resolves to one of the explicit specializations
// below. Instantiating it for an unsupported (TO, FROM) pair produces a
// compile-time failure rather than silently falling back to a C-style cast.
return to(val);
}
// Specializations
/********************* Identity Conversions *********************/
/*
Identity conversions are useful in templated functions where we might have
a fixed destination type. For example, I might have a kernel that accepts
__half, __nv_bfloat16, and float but always want to do the core computation
at floating point:
T mem_value = input[idx];
float compute_value = conversion::to<float, T>(mem_value);
In practice, we should be able to elide the second template parameter:
float compute_val = conversion::to<float>(mem_value);
In this case, we need an implementation to handle the T = float case
NOTE: The type inferencing system appears to be unable to handle inferring the first
template parameter, even in the trivial case.
*/
// Floating point types
// Identity specializations: the value is returned unchanged so that templated
// callers can invoke conversion::to<T>(val) uniformly even when the source and
// destination types coincide.
template <>
DS_D_INLINE double to(double val)
{
return val;
}
template <>
DS_D_INLINE float to(float val)
{
return val;
}
template <>
DS_D_INLINE __half to(__half val)
{
return val;
}
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE __nv_bfloat16 to(__nv_bfloat16 val)
{
return val;
}
#endif
// Integer types
template <>
DS_D_INLINE int8_t to(int8_t val)
{
return val;
}
template <>
DS_D_INLINE uint8_t to(uint8_t val)
{
return val;
}
template <>
DS_D_INLINE int16_t to(int16_t val)
{
return val;
}
template <>
DS_D_INLINE uint16_t to(uint16_t val)
{
return val;
}
template <>
DS_D_INLINE int32_t to(int32_t val)
{
return val;
}
template <>
DS_D_INLINE uint32_t to(uint32_t val)
{
return val;
}
template <>
DS_D_INLINE int64_t to(int64_t val)
{
return val;
}
template <>
DS_D_INLINE uint64_t to(uint64_t val)
{
return val;
}
// TODO: evaluate if we want bools
/********************* To Double Conversions *********************/
// * to double variants
// Note: we would normally avoid a C-style cast, but this conversion is
// important enough that we keep the cast as the non-PTX fallback.
// float -> double widening. The PTX path issues the CVT instruction directly;
// the fallback C-style cast is semantically identical (every float value is
// exactly representable as a double, so the conversion is exact and needs no
// rounding modifier).
template <>
DS_D_INLINE double to(float val)
{
#ifdef PTX_AVAILABLE
    double ret_val;
    // Fix: the opcode was misspelled "ctv.rn.f64.f32", which is not a valid
    // PTX instruction and fails ptxas assembly. The correct mnemonic is "cvt",
    // and the f32 -> f64 widening form takes no rounding modifier.
    asm("cvt.f64.f32 %0, %1;\n" : "=d"(ret_val) : "f"(val));
    return ret_val;
#else
    return double(val);
#endif
}
// Note: there is a CVT instruction for __half -> double, but there's no inline interface
// for passing a single half value
// __half -> double: there is no single-value intrinsic interface, so widen to
// float first (exact) and then to double (also exact).
template <>
DS_D_INLINE double to(__half val)
{
return to<double>(__half2float(val));
}
// Integer -> double conversions use the round-to-nearest-even CUDA intrinsics.
template <>
DS_D_INLINE double to(int64_t val)
{
return __ll2double_rn(val);
}
template <>
DS_D_INLINE double to(int32_t val)
{
return __int2double_rn(val);
}
// The 16- and 8-bit variants promote to int/unsigned and reuse the 32-bit
// intrinsics; this is lossless since double can represent all such values.
template <>
DS_D_INLINE double to(int16_t val)
{
return __int2double_rn(val);
}
template <>
DS_D_INLINE double to(int8_t val)
{
return __int2double_rn(val);
}
template <>
DS_D_INLINE double to(uint64_t val)
{
return __ull2double_rn(val);
}
template <>
DS_D_INLINE double to(uint32_t val)
{
return __uint2double_rn(val);
}
template <>
DS_D_INLINE double to(uint16_t val)
{
return __uint2double_rn(val);
}
template <>
DS_D_INLINE double to(uint8_t val)
{
return __uint2double_rn(val);
}
// Same applies here
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE double to(__nv_bfloat16 val)
{
return to<double>(__bfloat162float(val));
}
#endif
/********************* To Float Conversions *********************/
// Narrowing/widening to float. double -> float rounds to nearest even; the
// half/bf16 widenings are exact.
template <>
DS_D_INLINE float to(double val)
{
return __double2float_rn(val);
}
template <>
DS_D_INLINE float to(__half val)
{
return __half2float(val);
}
// Integer -> float conversions round to nearest even; 64-bit and large 32-bit
// values above 2^24 may lose precision (inherent to the float format).
template <>
DS_D_INLINE float to(int64_t val)
{
return __ll2float_rn(val);
}
template <>
DS_D_INLINE float to(int32_t val)
{
return __int2float_rn(val);
}
template <>
DS_D_INLINE float to(int16_t val)
{
return __int2float_rn(val);
}
template <>
DS_D_INLINE float to(int8_t val)
{
return __int2float_rn(val);
}
template <>
DS_D_INLINE float to(uint64_t val)
{
return __ull2float_rn(val);
}
template <>
DS_D_INLINE float to(uint32_t val)
{
return __uint2float_rn(val);
}
template <>
DS_D_INLINE float to(uint16_t val)
{
return __uint2float_rn(val);
}
template <>
DS_D_INLINE float to(uint8_t val)
{
return __uint2float_rn(val);
}
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE float to(__nv_bfloat16 val)
{
return __bfloat162float(val);
}
#endif
/********************* To Float2 Conversions *********************/
// Vector (packed pair) widenings, used by kernels that compute in float2.
template <>
DS_D_INLINE float2 to(__half2 val)
{
return __half22float2(val);
}
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE float2 to(__nv_bfloat162 val)
{
return __bfloat1622float2(val);
}
#endif
/********************* To Half Conversions *********************/
//aiss
//template <>
//DS_D_INLINE __half to(double val)
//{
// return __double2half(val);
//}
// Conversions into __half, all round-to-nearest-even.
// NOTE(review): the double -> __half specialization above is commented out in
// this fork (see "//aiss"); callers converting double must go through float.
template <>
DS_D_INLINE __half to(float val)
{
return __float2half(val);
}
template <>
DS_D_INLINE __half to(int64_t val)
{
return __ll2half_rn(val);
}
template <>
DS_D_INLINE __half to(int32_t val)
{
return __int2half_rn(val);
}
template <>
DS_D_INLINE __half to(int16_t val)
{
return __short2half_rn(val);
}
template <>
DS_D_INLINE __half to(int8_t val)
{
return __int2half_rn(val);
}
template <>
DS_D_INLINE __half to(uint64_t val)
{
return __ull2half_rn(val);
}
template <>
DS_D_INLINE __half to(uint32_t val)
{
return __uint2half_rn(val);
}
template <>
DS_D_INLINE __half to(uint16_t val)
{
return __ushort2half_rn(val);
}
template <>
DS_D_INLINE __half to(uint8_t val)
{
return __uint2half_rn(val);
}
#ifdef BF16_AVAILABLE
// No direct conversion
template <>
DS_D_INLINE __half to(__nv_bfloat16 val)
{
return to<__half>(to<float>(val));
}
#endif
/********************* To Half2 Conversions *********************/
// Packed float2 -> __half2 conversion (round-to-nearest for each lane).
template <>
DS_D_INLINE __half2 to(float2 val)
{
return __float22half2_rn(val);
}
#ifdef BF16_AVAILABLE
// No direct conversion
// bf16x2 -> halfx2 routes through float2 since no direct intrinsic exists.
template <>
DS_D_INLINE __half2 to(__nv_bfloat162 val)
{
return to<__half2>(to<float2>(val));
}
#endif
/********************* To BF16 Conversions *********************/
// Conversions into __nv_bfloat16 (SM80+ only; gated by BF16_AVAILABLE).
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE __nv_bfloat16 to(double val)
{
return __double2bfloat16(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(float val)
{
return __float2bfloat16(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(int64_t val)
{
return __ll2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(int32_t val)
{
return __int2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(int16_t val)
{
return __short2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(int8_t val)
{
return __int2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(uint64_t val)
{
return __ull2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(uint32_t val)
{
return __uint2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(uint16_t val)
{
return __ushort2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(uint8_t val)
{
return __uint2bfloat16_rn(val);
}
#endif
/********************* To BF162 Conversions *********************/
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE __nv_bfloat162 to(float2 val)
{
return __float22bfloat162_rn(val);
}
// halfx2 -> bf16x2 routes through float2 since no direct intrinsic exists.
template <>
DS_D_INLINE __nv_bfloat162 to(__half2 val)
{
return to<__nv_bfloat162>(to<float2>(val));
}
#endif
/********************* To INT64_T Conversions *********************/
// Floating point -> signed/unsigned integer conversions. All use the
// round-to-nearest intrinsics (not truncation).
template <>
DS_D_INLINE int64_t to(double val)
{
return __double2ll_rn(val);
}
template <>
DS_D_INLINE int64_t to(float val)
{
return __float2ll_rn(val);
}
template <>
DS_D_INLINE int64_t to(__half val)
{
return __half2ll_rn(val);
}
// No direct support for integer casts at the C++ level and I don't feel they're so important
// to demand a PTX at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE int64_t to(__nv_bfloat16 val)
{
return __bfloat162ll_rn(val);
}
#endif
/********************* To INT32_T Conversions *********************/
template <>
DS_D_INLINE int32_t to(double val)
{
return __double2int_rn(val);
}
template <>
DS_D_INLINE int32_t to(float val)
{
return __float2int_rn(val);
}
template <>
DS_D_INLINE int32_t to(__half val)
{
return __half2int_rn(val);
}
// No direct support for integer casts at the C++ level and I don't feel they're so important
// to demand a PTX at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE int32_t to(__nv_bfloat16 val)
{
return __bfloat162int_rn(val);
}
#endif
/********************* To INT16_T Conversions *********************/
// NOTE(review): the 16- and 8-bit variants reuse the 32-bit intrinsics and
// rely on implicit narrowing of the return value; values outside the narrow
// type's range are truncated modulo 2^16 / 2^8 rather than saturated —
// confirm callers only pass in-range values.
template <>
DS_D_INLINE int16_t to(double val)
{
return __double2int_rn(val);
}
template <>
DS_D_INLINE int16_t to(float val)
{
return __float2int_rn(val);
}
template <>
DS_D_INLINE int16_t to(__half val)
{
return __half2int_rn(val);
}
// No direct support for integer casts at the C++ level and I don't feel they're so important
// to demand a PTX at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE int16_t to(__nv_bfloat16 val)
{
return __bfloat162int_rn(val);
}
#endif
/********************* To INT8_T Conversions *********************/
template <>
DS_D_INLINE int8_t to(double val)
{
return __double2int_rn(val);
}
template <>
DS_D_INLINE int8_t to(float val)
{
return __float2int_rn(val);
}
template <>
DS_D_INLINE int8_t to(__half val)
{
return __half2int_rn(val);
}
// No direct support for integer casts at the C++ level and I don't feel they're so important
// to demand a PTX at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE int8_t to(__nv_bfloat16 val)
{
return __bfloat162int_rn(val);
}
#endif
/********************* To UINT64_T Conversions *********************/
template <>
DS_D_INLINE uint64_t to(double val)
{
return __double2ull_rn(val);
}
template <>
DS_D_INLINE uint64_t to(float val)
{
return __float2ull_rn(val);
}
template <>
DS_D_INLINE uint64_t to(__half val)
{
return __half2ull_rn(val);
}
// No direct support for integer casts at the C++ level and I don't feel they're so important
// to demand a PTX at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE uint64_t to(__nv_bfloat16 val)
{
return __bfloat162ull_rn(val);
}
#endif
/********************* To UINT32_T Conversions *********************/
template <>
DS_D_INLINE uint32_t to(double val)
{
return __double2uint_rn(val);
}
template <>
DS_D_INLINE uint32_t to(float val)
{
return __float2uint_rn(val);
}
template <>
DS_D_INLINE uint32_t to(__half val)
{
return __half2uint_rn(val);
}
// No direct support for integer casts at the C++ level and I don't feel they're so important
// to demand a PTX at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE uint32_t to(__nv_bfloat16 val)
{
return __bfloat162uint_rn(val);
}
#endif
/********************* To UINT16_T Conversions *********************/
// NOTE(review): as with int16/int8 above, these reuse the 32-bit unsigned
// intrinsics and narrow implicitly on return — out-of-range values wrap.
template <>
DS_D_INLINE uint16_t to(double val)
{
return __double2uint_rn(val);
}
template <>
DS_D_INLINE uint16_t to(float val)
{
return __float2uint_rn(val);
}
template <>
DS_D_INLINE uint16_t to(__half val)
{
return __half2uint_rn(val);
}
// No direct support for integer casts at the C++ level and I don't feel they're so important
// to demand a PTX at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE uint16_t to(__nv_bfloat16 val)
{
return __bfloat162uint_rn(val);
}
#endif
/********************* To UINT8_T Conversions *********************/
template <>
DS_D_INLINE uint8_t to(double val)
{
return __double2uint_rn(val);
}
template <>
DS_D_INLINE uint8_t to(float val)
{
return __float2uint_rn(val);
}
template <>
DS_D_INLINE uint8_t to(__half val)
{
return __half2uint_rn(val);
}
// No direct support for integer casts at the C++ level and I don't feel they're so important
// to demand a PTX at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE uint8_t to(__nv_bfloat16 val)
{
return __bfloat162uint_rn(val);
}
#endif
} // namespace conversion
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "simd.h"
#if defined(__ENABLE_CUDA__)
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
typedef __half ds_half_precision_t;
#else
typedef unsigned short ds_half_precision_t;
#endif
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
ds_half_precision_t* dev_param = nullptr, \
bool half_precision = false);
class Adagrad_Optimizer {
public:
Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
: _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
: _alpha(alpha), _eps(eps), _weight_decay(weight_decay)
{
#if defined(__ENABLE_CUDA__)
cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
_buf_index = false;
#endif
}
~Adagrad_Optimizer()
{
#if defined(__ENABLE_CUDA__)
cudaFreeHost(_doubled_buffer[0]);
cudaFreeHost(_doubled_buffer[1]);
#endif
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
......@@ -42,16 +57,18 @@ public:
float* grads,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
ds_half_precision_t* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
#if defined(__ENABLE_CUDA__)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
}
#endif
inline void IncrementStep(size_t step)
{
_step++;
......@@ -73,10 +90,11 @@ private:
float _betta2_t;
size_t _step;
float* _doubled_buffer[2];
#if defined(__ENABLE_CUDA__)
bool _buf_index;
float* _doubled_buffer[2];
cudaStream_t _streams[2];
#endif
};
#if defined(__AVX512__) or defined(__AVX256__)
......@@ -86,7 +104,7 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
ds_half_precision_t* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
......@@ -104,7 +122,9 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
#if defined(__ENABLE_CUDA__)
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#endif
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
......@@ -128,12 +148,14 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
#if defined(__ENABLE_CUDA__)
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
#endif
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
#if defined(__ENABLE_CUDA__)
if (dev_params) {
if (half_precision)
launch_param_update_half(
......@@ -144,6 +166,7 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
_buf_index = !_buf_index;
}
#endif
}
*rounded_size = new_rounded_size;
}
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "simd.h"
#if defined(__ENABLE_CUDA__)
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
typedef __half ds_half_precision_t;
#else
#include <cmath>
typedef unsigned short ds_half_precision_t;
#endif
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
ds_half_precision_t* dev_param = nullptr, \
bool half_precision = false);
class Adam_Optimizer {
......@@ -36,20 +47,25 @@ public:
_betta1_t(1.0),
_betta2_t(1.0),
_step(0),
_buf_index(false),
_adamw_mode(adamw_mode)
{
#if defined(__ENABLE_CUDA__)
cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
_buf_index = false;
#endif
}
~Adam_Optimizer()
{
#if defined(__ENABLE_CUDA__)
cudaFreeHost(_doubled_buffer[0]);
cudaFreeHost(_doubled_buffer[1]);
#endif
}
#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Step_AVX(size_t* rounded_size,
......@@ -58,16 +74,18 @@ public:
float* _exp_avg,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
ds_half_precision_t* dev_param = nullptr,
bool half_precision = false);
#endif
STEP(1)
STEP(4)
STEP(8)
#if defined(__ENABLE_CUDA__)
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
}
#endif
inline void IncrementStep(size_t step, float beta1, float beta2)
{
if (beta1 != _betta1 || beta2 != _betta2) {
......@@ -116,11 +134,13 @@ private:
float _bias_correction1;
float _bias_correction2;
float* _doubled_buffer[2];
bool _buf_index;
bool _adamw_mode;
#if defined(__ENABLE_CUDA__)
float* _doubled_buffer[2];
cudaStream_t _streams[2];
bool _buf_index;
#endif
};
#if defined(__AVX512__) or defined(__AVX256__)
......@@ -131,10 +151,11 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
ds_half_precision_t* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
int rshft = half_precision ? 1 : 0;
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
......@@ -167,11 +188,13 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size,
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
#if defined(__ENABLE_CUDA__)
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#endif
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
simd_load<span>(grad_4, grads + (i >> rshft), half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
......@@ -180,7 +203,7 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size,
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
simd_load<span>(param_4, _params + (i >> rshft), half_precision);
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
......@@ -201,14 +224,16 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size,
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
simd_store<span>(_params + (i >> rshft), param_4, half_precision);
#if defined(__ENABLE_CUDA__)
if (dev_params) {
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
#endif
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
#if defined(__ENABLE_CUDA__)
if (dev_params) {
if (half_precision)
launch_param_update_half(
......@@ -219,6 +244,7 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size,
_buf_index = !_buf_index;
}
#endif
}
*rounded_size = new_rounded_size;
}
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#include <assert.h>
......
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once
#include "ds_kernel_utils.h"
#include <cuda.h>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
......@@ -45,30 +41,6 @@
#define WARP_SIZE_BITS 5
template <typename T>
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
......@@ -301,3 +273,54 @@ void launch_fuse_transpose_bias_kernel(const T* inp,
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream);
void launch_token_sort(int32_t* indices,
int layers,
int batch_size,
int reserved_size,
int original_tokens,
cudaStream_t stream);
template <typename T>
void launch_gather_tokens(T* retained_tokens,
T* activations,
int32_t* gather_indices,
int32_t batch_size,
int32_t sampled_tokens,
int32_t channels,
int32_t read_batch_stride,
int32_t read_seq_stride,
int32_t write_batch_stride,
int32_t write_seq_stride,
cudaStream_t stream);
template <typename T>
void launch_scatter_tokens(T* all_activations,
T* layer_activations,
int32_t* gather_indices,
int32_t batch_size,
int32_t sampled_tokens,
int32_t channels,
int32_t read_batch_stride,
int32_t read_seq_stride,
int32_t write_batch_stride,
int32_t write_seq_stride,
cudaStream_t stream);
template <typename T>
void launch_slice_gpt_mask(T* output_mask,
const T* input_mask,
int batch_size,
int truncated_seq_len,
int orig_seq_len,
cudaStream_t stream);
template <typename T>
void launch_slice_bert_mask(T* output_mask,
const T* input_mask,
const int32_t* retained_indices,
int32_t layers,
int32_t batch_size,
int32_t truncated_seq_len,
int32_t orig_seq_len,
cudaStream_t stream);
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#include "conversion_utils.h"
#include "ds_kernel_utils.h"
#include "quantization.h"
#include "quantization_utils.h"
namespace cg = cooperative_groups;
#pragma once
namespace dequantize {
using Type = quantize::Type;
template <Type qType, int numBits>
using Params = quantize::Params<qType, numBits>;
constexpr int granularity = quantize::granularity;
using PackedInt4 = quantize::PackedInt4;
constexpr int h_per_chunk = granularity / sizeof(__half);
constexpr int h2_per_chunk = granularity / sizeof(__half2);
/*
Device function that reads quantized data from global memory, dequantizes
it, and stores it to global memory.
Template Arguments :
numBits - Number of bits in quantized element. int: 4, 8
qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric
unroll - Number of load steps to internally unroll int
threads - Number of threads to perform dequant int
Function arguments:
global_output - __half pointer in global memory
data - Quantized data in global memory
global_params - Quantization parameters in global memory
elems_per_group - Number of elements in each quantization group
total_elems - Tensor size (note, does not need to be multiple of elems_per_group)
*/
template <int numBits, Type qType, int unroll, int threads>
DS_D_INLINE void to_global(__half* global_output,
const int8_t* data,
const float* global_params,
const int elems_per_group,
const int total_elems);
/*
Device function that quantizes 16 bytes of __half type input data.
Template Arguments :
numBits - Number of bits in quantized element. int : 8 or 4
qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric
Function Arguments :
local_output - Local array to store dequantized data __half* or __half2*
data - Pointer to quantized input data. int8_t*
Params - Parameters for quantization. Params<qType, numBits>
*/
template <int numBits, Type qType>
DS_D_INLINE void chunk(__half2* local_output, const int8_t* data, Params<qType, numBits> q_params);
template <typename T, int numBits, Type qType>
DS_D_INLINE void chunk(T* local_output, const int8_t* data, Params<qType, numBits> q_params);
/**************** Implementations ******************/
// Dequantize one 16-byte chunk of packed int8/int4 data into `local_output`.
// numBits selects the packing density: 8-bit stores one value per byte,
// 4-bit stores two values (low and high nibble) per byte.
template <typename T, int numBits, Type qType>
DS_D_INLINE void chunk(T* local_output, const int8_t* data, Params<qType, numBits> q_params)
{
// Values packed per input byte: 1 for int8, 2 for int4.
constexpr int32_t num_elems_packed = 8 / numBits;
// Bytes consumed to produce h_per_chunk output elements.
constexpr int32_t iters = h_per_chunk / num_elems_packed;
#pragma unroll
for (int i = 0; i < iters; i++) {
if constexpr (num_elems_packed == 1) {
// 8-bit path: each byte is one quantized value.
local_output[i] = q_params.template dequantize<T>(data[i]);
} else {
// 4-bit path: reinterpret the byte as two signed nibbles and emit both.
auto accessible_data = *(PackedInt4*)(&data[i]);
local_output[2 * i] = q_params.template dequantize<T>(accessible_data.low);
local_output[2 * i + 1] = q_params.template dequantize<T>(accessible_data.high);
}
}
}
// __half2 overload: dequantizes by viewing the output buffer as a flat __half
// array and delegating to the scalar implementation (qType is deduced from
// the q_params argument).
template <int numBits, Type qType>
DS_D_INLINE void chunk(__half2* local_output, const int8_t* data, Params<qType, numBits> q_params)
{
__half* local_output_cast = reinterpret_cast<__half*>(local_output);
chunk<__half, numBits>(local_output_cast, data, q_params);
}
// Core dequantization loop: each thread loads `unroll` strided chunks of
// packed int8 data from global memory, dequantizes each chunk with the
// per-group Params loaded from `global_params`, and stores the widened
// elements back to `global_output`. Tail elements beyond `total_elems` are
// guarded by the per-iteration bounds check.
// Assumes a launch of `threads` threads per block whose blocks tile the
// tensor; each thread handles T_per_chunk output elements per unroll step.
template <typename T, int numBits, Type qType, int unroll, int threads>
DS_D_INLINE void _to_global(T* global_output,
const int8_t* data,
const float* global_params,
const int elems_per_group,
const int total_elems)
{
cg::thread_block tb = cg::this_thread_block();
// NOTE(review): `warp` appears unused in this function — possibly left over
// from an earlier revision; confirm before removing.
cg::thread_block_tile<hw_warp_size> warp = cg::tiled_partition<hw_warp_size>(tb);
// Load constants
// TODO(cmikeh2): Refactor into functions?
// Bytes of packed input per thread per step (half the output footprint for
// 4-bit data, since each byte expands to two elements).
constexpr int load_granularity = (granularity / (sizeof(T))) / (numBits == 8 ? 1 : 2);
constexpr int load_step_stride = load_granularity * threads;
constexpr int load_block_stride = load_step_stride * unroll;
// Store constants
// Output elements produced per thread per step (one 16-byte granule).
constexpr int T_per_chunk = granularity / sizeof(T);
constexpr int store_step_stride = T_per_chunk * threads;
constexpr int store_block_stride = store_step_stride * unroll;
// Load offsets
const int load_block_offset = tb.group_index().x * load_block_stride;
// Note: we can use `load_granularity` since the dtype is `int8_t`.
const int load_thread_offset = tb.thread_index().x * load_granularity;
const int8_t* load_base = data + load_block_offset + load_thread_offset;
// Store offsets
const int store_block_offset = tb.group_index().x * store_block_stride;
const int store_thread_offset = tb.thread_index().x * T_per_chunk;
const int elem_id_base = store_block_offset + store_thread_offset;
// Per-thread staging buffers (registers/local memory).
int8_t local_load_buffer[load_granularity * unroll];
T local_dequant_buffer[T_per_chunk * unroll];
/*
Note: Splitting this loop in half gave about 3-5% performance increase for reasons that aren't
totally clear to me, so this is a deliberately weird code structure.
*/
// Phase 1: issue all global loads up front so they can overlap.
#pragma unroll
for (int i = 0; i < unroll; i++) {
const int elem_id_iter = elem_id_base + i * store_step_stride;
if (elem_id_iter < total_elems) {
mem_access::load_global<load_granularity>(local_load_buffer + i * load_granularity,
load_base + i * load_step_stride);
}
}
// Phase 2: dequantize each staged chunk and store the widened result.
#pragma unroll
for (int i = 0; i < unroll; i++) {
const int elem_id_iter = elem_id_base + i * store_step_stride;
if (elem_id_iter < total_elems) {
// TODO(cmikeh2): Can we amortize this division? Perform once on the first iteration and
// use indexing math to do division free interpolation of the successive groups?
const int group_index = elem_id_iter / elems_per_group;
Params<qType, numBits> q_params(global_params, group_index);
chunk<T, numBits, qType>(local_dequant_buffer + i * T_per_chunk,
local_load_buffer + i * load_granularity,
q_params);
mem_access::store_global<granularity>(global_output + elem_id_iter,
local_dequant_buffer + i * T_per_chunk);
}
}
}
// Public entry point for global-memory dequantization. Dispatches to the
// implementation for the supported bit widths; any other width (including
// the planned 3-bit path) traps via a device-side assert.
template <typename T, int numBits, Type qType, int unroll, int threads>
DS_D_INLINE void to_global(T* global_output,
const int8_t* data,
const float* global_params,
const int elems_per_group,
const int total_elems)
{
if constexpr (numBits == 4 || numBits == 8) {
_to_global<T, numBits, qType, unroll, threads>(
global_output, data, global_params, elems_per_group, total_elems);
} else {
// TODO(cmikeh2): 3-bit support still needs an implementation; all
// unsupported widths fail loudly rather than silently mis-decoding.
assert(false);
}
}
} // namespace dequantize
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#include <cuda.h>
......
/*
Copyright 2022 The Microsoft DeepSpeed Team
Centralized header file for preprocessor macros and constants
used throughout the codebase.
*/
#pragma once
#include <cuda.h>
#define DS_HD_INLINE __host__ __device__ __forceinline__
#define DS_D_INLINE __device__ __forceinline__
#ifdef __HIP_PLATFORM_HCC__
// constexpr variant of warpSize for templating
constexpr int hw_warp_size = 64;
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else // !__HIP_PLATFORM_HCC__
// constexpr variant of warpSize for templating
constexpr int hw_warp_size = 32;
#if __CUDA_ARCH__ >= 530
#define HALF_PRECISION_AVAILABLE = 1
#define PTX_AVAILABLE
#endif // __CUDA_ARCH__ >= 530
#if __CUDA_ARCH__ >= 800
#define ASYNC_COPY_AVAILABLE
#define BF16_AVAILABLE
#endif // __CUDA_ARCH__ >= 800
#include <cooperative_groups.h>
#endif //__HIP_PLATFORM_HCC__
/*
Round `val` up to the nearest power of two (a value that is already a power
of two is returned unchanged). Uses the classic bit-smearing trick: subtract
one, propagate the highest set bit into every lower position, then add one.
Meaningful for 1 <= val <= 2^30; val == 0 returns 0.
*/
inline int next_pow2(const int val)
{
    int rounded_val = val - 1;
    rounded_val |= rounded_val >> 1;
    rounded_val |= rounded_val >> 2;
    rounded_val |= rounded_val >> 4;
    rounded_val |= rounded_val >> 8;
    // Fix: the final 16-bit smear was missing, so inputs just above 2^16
    // (e.g. 65537) returned a non-power-of-two (131071 instead of 131072).
    rounded_val |= rounded_val >> 16;
    return rounded_val + 1;
}
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#include <cuda_runtime_api.h>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#include <cuda.h>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
......
This diff is collapsed.
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#include <cuda.h>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#include <cuda_fp16.h>
#include "ds_kernel_utils.h"
namespace quantize {
// Supported quantization schemes: Symmetric is scale-only (zero-centered);
// Asymmetric additionally carries an offset (zero point).
enum class Type { Symmetric, Asymmetric };
// Two signed 4-bit values packed into a single byte.
// NOTE(review): `high` is declared first; on common little-endian ABIs the
// first-declared bitfield occupies the low-order bits — confirm this matches
// the packing order used by the quantization kernels.
struct PackedInt4 {
int8_t high : 4;
int8_t low : 4;
};
// Only the asymmetric scheme stores an offset alongside the scale.
DS_HD_INLINE bool requires_offset(Type qType) { return qType == Type::Asymmetric; }
} // namespace quantize
void launch_quant(int8_t* output_data,
float* params,
const __half* input_data,
const int groups,
const int elems_per_group,
const int num_bits,
const quantize::Type quant_type,
cudaStream_t stream);
template <typename T>
void launch_dequantize_kernel(T* dequant_data,
const int8_t* q_data,
const float* q_params,
quantize::Type q_type,
int num_bits,
int elems_per_group,
int total_elems,
cudaStream_t stream);
template <typename T>
void launch_fake_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_fake_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_fake_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_fake_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#include <cassert>
#include "conversion_utils.h"
#include "ds_kernel_utils.h"
#include "memory_access_utils.h"
#include "quantization.h"
#include "reduction_utils.h"
#pragma once
// Shorthand for the reduction-op enum used throughout this header.
using rop = reduce::ROpType;
namespace quantize {
// Per-thread access granularity: every load/store moves 16 bytes.
constexpr int granularity = 16;
constexpr int h_per_load = granularity / sizeof(__half);    // 8 __half per 16B access
constexpr int h2_per_load = granularity / sizeof(__half2);  // 4 __half2 per 16B access
// Upper bound on threads per block used by the launch heuristics.
constexpr int max_threads = 1024;
/*
Class to hold the quantization parameters for a given tensor.
Holds the implementation of the quantization operation.

Primary template: declarations only. The two specializations below
(Type::Symmetric and Type::Asymmetric) provide the implementations.
*/
template <Type qType, int numBits>
class Params {
public:
/*
Quantization implementation, supports
1) 4 Bit
2) 8 Bit
3) Symmetric
4) Asymmetric
Function Arguments :
val : The __half value to quantize.
*/
DS_D_INLINE int8_t quantize(__half val);
// Inverse transform of quantize(); T selects the output precision.
template <typename T>
DS_D_INLINE T dequantize(int8_t val);
// Serialize this group's parameters into the global `params` buffer.
DS_D_INLINE void store(float* params, int group_index);
// Initialize from memory (parameters previously written by store()).
DS_D_INLINE Params(const float* params, int group_index);
};
// Symmetric quantization parameters: a single multiplicative scale mapping
// the range [-max, +max] onto the signed numBits integer range.
template <int numBits>
class Params<Type::Symmetric, numBits> {
public:
    // Scale applied to values before rounding during quantize(). NOTE:
    // store() serializes the reciprocal, and the loading constructor reads
    // that value straight into `scale`, so a Params reconstructed from
    // memory holds 1/scale — which is what the dequantization path expects.
    float scale;

    // Build params from the group's maximum absolute value. A zero max is
    // special-cased so an all-zero group avoids division by zero.
    DS_D_INLINE Params(float max)
    {
        if (max == 0) {
            scale = 1.0f;  // float literal: avoid implicit double math in device code
        } else {
            scale = (1 << numBits) / (2 * max);
        }
    }

    // Quantize one __half to a numBits signed integer, saturating at the
    // representable range.
    DS_D_INLINE int8_t quantize(__half val)
    {
        constexpr int32_t q_min = -(1 << (numBits - 1));
        constexpr int32_t q_max = (1 << (numBits - 1)) - 1;

        float val_f = conversion::to<float>(val) * scale;
        int32_t data_i32 = conversion::to<int32_t>(val_f);
        data_i32 = min(max(data_i32, q_min), q_max);
        return (int8_t)data_i32;
    }

    // Expand a quantized value back to T (e.g. __half or float).
    template <typename T>
    DS_D_INLINE T dequantize(int8_t val)
    {
        const float val_deq_f = conversion::to<float>(val) * scale;
        return conversion::to<T>(val_deq_f);
    }

    // Persist this group's parameter: the reciprocal scale is stored so the
    // dequantization path needs only a multiply.
    DS_D_INLINE void store(float* params, int group_index)
    {
        const float store_scale = 1.0f / scale;
        mem_access::store_global<sizeof(float)>(params + group_index, &store_scale);
    }

    // Reload a parameter previously written by store().
    DS_D_INLINE Params(const float* params, int group_index)
    {
        mem_access::load_global<sizeof(float)>(&scale, params + group_index);
    }
};
// Asymmetric quantization parameters: a scale plus an additive offset
// (zero point) mapping the range [min, max] onto the signed numBits range.
template <int numBits>
class Params<Type::Asymmetric, numBits> {
public:
    float scale;   // multiplicative scale applied before rounding
    float offset;  // additive shift, in quantized units

    // Build params from the group's true min/max range. A degenerate range
    // (max == min) is special-cased to avoid division by zero.
    DS_D_INLINE Params(float max, float min)
    {
        if (max == min) {
            scale = 1.0f;  // float literal: avoid implicit double math in device code
        } else {
            scale = (1 << numBits) / (max - min);
        }
        // Shift so that `min` maps to the most negative quantized value.
        offset = -(1 << (numBits - 1)) - (min * scale);
    }

    // Quantize one __half to a numBits signed integer, saturating at the
    // representable range.
    DS_D_INLINE int8_t quantize(__half val)
    {
        constexpr int32_t q_min = -(1 << (numBits - 1));
        constexpr int32_t q_max = (1 << (numBits - 1)) - 1;

        float val_f = conversion::to<float>(val) * scale + offset;
        int32_t data_i32 = conversion::to<int32_t>(val_f);
        data_i32 = min(max(data_i32, q_min), q_max);
        return (int8_t)data_i32;
    }

    // Expand a quantized value back to T (e.g. __half or float).
    template <typename T>
    DS_D_INLINE T dequantize(int8_t val)
    {
        const float val_deq_f = conversion::to<float>(val) * scale + offset;
        // BUGFIX: convert to the template parameter T rather than the
        // hard-coded __half the previous version used, matching the
        // Symmetric specialization. The old code compiled (the __half
        // converted implicitly when T was float) but silently round-tripped
        // the result through half precision.
        return conversion::to<T>(val_deq_f);
    }

    // Persist this group's parameters as (1/scale, offset) at 2*group_index.
    DS_D_INLINE void store(float* params, int group_index)
    {
        // Codegen should turn this into stg.64
        const float store_scale = 1.0f / scale;
        mem_access::store_global<sizeof(float)>(params + 2 * group_index, &store_scale);
        mem_access::store_global<sizeof(float)>(params + 2 * group_index + 1, &offset);
    }

    // Reload parameters previously written by store().
    DS_D_INLINE Params(const float* params, int group_index)
    {
        // Codegen should turn this into ldg.64
        mem_access::load_global<sizeof(float)>(&scale, params + 2 * group_index);
        mem_access::load_global<sizeof(float)>(&offset, params + 2 * group_index + 1);
    }
};
/*
Group stats tracks the necessary statistics about the quantized group
to abstract the particulars for the main loop.

Primary template: declarations only; the Symmetric and Asymmetric
specializations below hold the state and implementations.
*/
template <Type qType>
class GroupStats {
public:
// Fold one __half2 element into the running statistics.
DS_D_INLINE void update(__half2 val);
// Combine per-thread statistics across the thread group.
DS_D_INLINE void reduce(cg::thread_block& tb, cg::thread_block_tile<hw_warp_size>& warp);
};
template <>
class GroupStats<Type::Symmetric> {
public:
// Symmetric quantization only tracks the maximum absolute value
__half2 cur_max;
// NOTE(review): this member appears unused — get_params() declares a local
// `max` that shadows it. Candidate for removal if nothing outside this
// header reads it; kept to preserve the public layout.
float max;
/*
Technically, this would give bad results if there
are 0 values to process since the reduction would
give -inf instead of 0. We do not consider this
to be a reasonable edge case.
*/
DS_D_INLINE GroupStats() { cur_max = reduce::init<rop::Max, __half2>(); }
/*
Updated the running absmax used to calculate params.
Function Arguments :
val : The __half2 value to update the running min and max with.
*/
DS_D_INLINE void update(__half2 val)
{
cur_max = reduce::element<rop::Max>(cur_max, __habs2(val));
}
/*
Function to return calculated quantization params.
Template Arguments :
numBits - Number of bits in quantized element. int : 8 or 4
Function Arguments :
tb - Threadblock object. cg::thread_block
warp - Warp object. cg::thread_block_tile<hw_warp_size>
*/
template <int numBits, int threads_per_group>
DS_D_INLINE Params<Type::Symmetric, numBits> get_params(
cg::thread_block& tb,
cg::thread_block_tile<hw_warp_size>& warp)
{
// Collapse the lane-local __half2 pair to one float, then reduce across
// the thread group so every participant sees the group-wide absmax.
const float2 partial_max = conversion::to<float2>(cur_max);
float max = reduce::element<rop::Max>(partial_max.x, partial_max.y);
reduce::partitioned_block<rop::Max, threads_per_group>(tb, warp, max);
Params<Type::Symmetric, numBits> params(max);
return params;
}
};
template <>
class GroupStats<Type::Asymmetric> {
public:
// Running maximum and minimum, tracked pairwise as __half2.
__half2 cur_max;
__half2 cur_min;
/*
Initialize cur_max to -inf, cur_min to inf since
we are doing a true range analysis.
*/
DS_D_INLINE GroupStats()
{
cur_max = reduce::init<rop::Max, __half2>();
cur_min = reduce::init<rop::Min, __half2>();
}
/*
Updated the running min and max used to calculate params.
Function Arguments :
val : The __half2 value to update the running min and max with.
*/
DS_D_INLINE void update(__half2 val)
{
cur_max = reduce::element<rop::Max>(cur_max, val);
cur_min = reduce::element<rop::Min>(cur_min, val);
}
/*
Function to return calculated quantization params.
Template Arguments :
numBits - Number of bits in quantized element. int : 8 or 4
Function Arguments :
tb - Threadblock object. cg::thread_block
warp - Warp object. cg::thread_block_tile<hw_warp_size>
*/
template <int numBits, int threads_per_group>
DS_D_INLINE Params<Type::Asymmetric, numBits> get_params(
cg::thread_block& tb,
cg::thread_block_tile<hw_warp_size>& warp)
{
// Collapse each lane's __half2 pair, then do a fused two-op block
// reduction (max and min together) across the thread group.
const float2 partial_max = conversion::to<float2>(cur_max);
float max = reduce::element<rop::Max>(partial_max.x, partial_max.y);
const float2 partial_min = conversion::to<float2>(cur_min);
float min = reduce::element<rop::Min>(partial_min.x, partial_min.y);
reduce::partitioned_block<rop::Max, rop::Min, threads_per_group>(tb, warp, max, min);
Params<Type::Asymmetric, numBits> params(max, min);
return params;
}
};
/*
Device function that quantizes 16 bytes of __half type input data.
Template Arguments :
numBits - Number of bits in quantized element. int : 8 or 4
qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric
Function Arguments :
local_output - Pointer to local memory to store quantized data. int8_t*
data - Pointer to input data. __half*
Params - Parameters for quantization. Params<qType, numBits>
*/
template <int numBits, Type qType>
DS_D_INLINE void _chunk(int8_t* local_output, const __half* data, Params<qType, numBits> q_params);
/*
Device function that quantizes 16 bytes of __half2 type input data.
Template Arguments :
numBits - Number of bits in quantized element. int : 8 or 4
qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric
Function Arguments :
local_output - Pointer to local memory to store quantized data. int8_t*
data - Pointer to input data. __half2*
Params - Parameters for quantization. Params<qType, numBits>
*/
template <int numBits, Type qType>
DS_D_INLINE void _chunk(int8_t* local_output, const __half2* data, Params<qType, numBits> q_params);
/*
Helper function to do serial reduction on register-file arrays.
Template Arguments :
qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric
numChunks - Number of 16-byte chunks held in the register-file buffer.
Function Arguments :
local_buffer - Pointer memory with input half2 data to be quantized.
Returns the accumulated GroupStats for this thread's elements.
*/
template <Type qType, int numChunks>
DS_D_INLINE GroupStats<qType> _local_serial_reduce(__half2* local_buffer);
/*
The main loop of the kernel that quantizes array in local memory of __half2 type input data, when
Quantization parameters are pre-computed.
Template Arguments :
qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric
numBits - Number of bits in quantized element. int : 8 or 4
numChunks - Number of chunks(16 bytes of Input data). int : 8 or 4
Function Arguments :
local_buffer - Pointer memory with input half2 data to be quantized.
scales - Pointer to output scales.
offsets - Pointer to output offsets.
output_data - Pointer to output data.
elems_per_group - Number of elements to quantize in a group.
q_params - Quantization parameters.

NOTE(review): this declaration takes separate `scales`/`offsets` pointers but
the definition later in this file takes a single `global_params` pointer —
the two are distinct overloads and this one appears never to be defined.
Confirm no caller references it before relying on it.
*/
template <int numBits, Type qType, int numChunks, int threads_per_group, int max_threads>
DS_D_INLINE void local_array(cg::thread_block& tb,
cg::thread_block_tile<hw_warp_size>& warp,
__half2* local_buffer,
float* __restrict__ scales,
float* __restrict__ offsets,
int8_t* __restrict__ output_data,
const int& elems_per_group,
const int& groups,
Params<qType, numBits> q_params);
/*
The main loop of the kernel that quantizes array in local memory of __half2 type input data.
This function computes quantization parameters for each group.
Template Arguments :
qType - Type of quantization to perform. Type::Symmetric or Type::Asymmetric
numBits - Number of bits in quantized element. int : 8 or 4
numChunks - Number of chunks(16 bytes of Input data). int : 8 or 4
Function Arguments :
local_buffer - Pointer memory with input half2 data to be quantized.
scales - Pointer to output scales.
offsets - Pointer to output offsets.
output_data - Pointer to output data.
elems_per_group - Number of elements to quantize in a group.

NOTE(review): same mismatch as above — the definition below uses a single
`global_params` pointer, so this scales/offsets overload looks stale.
*/
template <Type qType, int numBits, int numChunks, int threads_per_group, int max_threads>
__device__ void local_array(__half2* local_buffer,
float* __restrict__ scales,
float* __restrict__ offsets,
int8_t* __restrict__ output_data,
const int& elems_per_group,
const int& groups);
// Quantize one 16-byte chunk of scalar __half input into local_output.
// 8-bit mode emits one byte per element; 4-bit mode packs two nibbles per
// output byte via PackedInt4.
template <int numBits, Type qType>
DS_D_INLINE void _chunk(int8_t* local_output, const __half* data, Params<qType, numBits> q_params)
{
// 16 bytes of __half input -> 8 elements; num_elems_packed input elements
// share one output byte (1 for 8-bit, 2 for 4-bit).
constexpr int32_t elems = 16 / sizeof(__half);
constexpr int32_t num_elems_packed = 8 / numBits;
#pragma unroll
for (int i = 0, oi = 0; i < elems; i += num_elems_packed, oi++) {
if (num_elems_packed == 1) {
// TODO(cmikeh2): refactor to use conversion utils
// 8-bit path: i and oi advance in lockstep here, so local_output[i]
// is equivalent to local_output[oi].
local_output[i] = q_params.quantize(data[i]);
} else if (num_elems_packed == 2) {
// 4-bit path: second element lands in `high`, first in `low`.
// NOTE(review): bit-field order is implementation-defined and must
// match the unpack side — verify against the dequantize kernels.
int8_t data_i8_1 = q_params.quantize(data[i]);
int8_t data_i8_2 = q_params.quantize(data[i + 1]);
auto data_i8 = PackedInt4{data_i8_2, data_i8_1};
local_output[oi] = *((int8_t*)(&data_i8));
}
}
}
// __half2 adapter for _chunk: the packed pairs are viewed as a flat run of
// scalar halves and handed to the scalar implementation above.
template <int numBits, Type qType>
DS_D_INLINE void _chunk(int8_t* local_output, const __half2* data, Params<qType, numBits> q_params)
{
    _chunk<numBits>(local_output, reinterpret_cast<const __half*>(data), q_params);
}
// Serially fold every __half2 element of this thread's register-file buffer
// into a fresh GroupStats accumulator. The trip count is a compile-time
// constant, so the loop fully unrolls.
template <Type qType, int numChunks>
DS_D_INLINE GroupStats<qType> _local_serial_reduce(__half2* local_buffer)
{
    constexpr int total_h2 = numChunks * h2_per_load;
    GroupStats<qType> stats;
#pragma unroll
    for (int idx = 0; idx < total_h2; ++idx) { stats.update(local_buffer[idx]); }
    return stats;
}
// Core quantization loop for pre-computed parameters: thread 0 of each group
// stores the group's params, then every thread quantizes its register-file
// chunks and writes the packed bytes to global memory.
template <Type qType, int numBits, int numChunks, int threads_per_group, int max_threads>
DS_D_INLINE void local_array(cg::thread_block& tb,
cg::thread_block_tile<hw_warp_size>& warp,
__half2* local_buffer,
float* __restrict__ global_params,
int8_t* __restrict__ output_data,
const int& elems_per_group,
const int& groups,
Params<qType, numBits> q_params)
{
// Input elements per output byte (1 for 8-bit, 2 for 4-bit) and output
// bytes produced per 16-byte input load.
constexpr int num_ele_int8 = 8 / numBits;
constexpr int num_int8_out = quantize::h_per_load / num_ele_int8;
// Indexing offsets
// block_num: which quantization group this (x,y) thread slice handles;
// multiple groups can share one thread block (threads_per_group <= block size).
const int block_num =
(tb.group_index().x * max_threads / threads_per_group) + tb.thread_index().y;
const int block_offset = block_num * elems_per_group;
const int elem_offset = tb.thread_index().x * quantize::h_per_load;
const int base_offset = (block_offset + elem_offset) / num_ele_int8;
const int stride = tb.size() * quantize::h_per_load / num_ele_int8;
int8_t local_output[num_int8_out];
// One thread per group persists the quantization parameters.
if (tb.thread_index().x == 0 && block_num < groups) {
q_params.store(
global_params,
(tb.group_index().x * max_threads / threads_per_group) + tb.thread_index().y);
}
#pragma unroll
for (int i = 0; i < numChunks; i++) {
// Guard the ragged tail: the final chunks may fall outside the group.
if (elem_offset + i * stride * num_ele_int8 < elems_per_group && block_num < groups) {
quantize::_chunk<numBits, qType>(
local_output, local_buffer + i * quantize::h2_per_load, q_params);
mem_access::store_global<num_int8_out>(output_data + (base_offset + i * stride),
local_output);
}
}
}
// Scalar-__half adapter for the parameterized quantization loop: views the
// buffer as __half2 and defers to the vectorized implementation above.
// Interface is unchanged; only the forwarded pointer is corrected.
template <Type qType, int numBits, int numChunks, int threads_per_group, int max_threads>
DS_D_INLINE void local_array(cg::thread_block& tb,
                             cg::thread_block_tile<hw_warp_size>& warp,
                             __half* local_buffer,
                             float* __restrict__ global_params,
                             int8_t* __restrict__ output_data,
                             const int& elems_per_group,
                             const int& groups,
                             Params<qType, numBits> q_params)
{
    __half2* local_buffer_h2 = reinterpret_cast<__half2*>(local_buffer);
    // BUGFIX: previously forwarded `local_buffer` (a __half*) instead of the
    // reinterpreted pointer, so overload resolution re-selected this same
    // __half overload and recursed rather than reaching the __half2
    // implementation. Forward the __half2 view, matching the sibling
    // adapter at the bottom of this header.
    quantize::local_array<qType, numBits, numChunks, threads_per_group, max_threads>(
        tb, warp, local_buffer_h2, global_params, output_data, elems_per_group, groups, q_params);
}
template <Type qType,
int numBits,
int numChunks,
// NOTE(review): this default resolves to the namespace-level constant
// quantize::max_threads (1024), NOT the template parameter declared on the
// next line — C++ cannot reference a later template parameter in a default.
// Since 1024 > the default max_threads of 256, confirm that callers always
// pass threads_per_group explicitly.
int threads_per_group = max_threads,
int max_threads = 256>
__device__ void local_array(__half2* local_buffer,
float* __restrict__ global_params,
int8_t* __restrict__ output_data,
const int& elems_per_group,
const int& groups)
{
// Build cooperative-groups handles for the block and its warps.
cg::thread_block tb = cg::this_thread_block();
cg::thread_block_tile<hw_warp_size> warp = cg::tiled_partition<hw_warp_size>(tb);
// Per-thread serial reduction over the register-resident data, then a
// group-wide reduction inside get_params to derive the quantization
// parameters, then the main quantize-and-store loop.
auto group_stats = _local_serial_reduce<qType, numChunks>(local_buffer);
auto params = group_stats.template get_params<numBits, threads_per_group>(tb, warp);
quantize::local_array<qType, numBits, numChunks, threads_per_group, max_threads>(
tb, warp, local_buffer, global_params, output_data, elems_per_group, groups, params);
}
// Scalar-__half entry point: view the register buffer as __half2 and
// dispatch to the vectorized entry point, which computes the quantization
// parameters and performs the quantize-and-store loop.
template <Type qType, int numBits, int numChunks, int threads_per_group, int max_threads>
__device__ void local_array(__half* local_buffer,
                            float* __restrict__ global_params,
                            int8_t* __restrict__ output_data,
                            const int& elems_per_group,
                            const int& groups)
{
    quantize::local_array<qType, numBits, numChunks, threads_per_group, max_threads>(
        reinterpret_cast<__half2*>(local_buffer),
        global_params,
        output_data,
        elems_per_group,
        groups);
}
} // namespace quantize
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#include <cooperative_groups.h>
......
This diff is collapsed.
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#if (__x86_64__ || __i386__)
......@@ -22,7 +26,7 @@
#define SIMD_WIDTH 16
#define SIMD_LOAD2(x, h) \
((h) ? _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)x)) : _mm512_loadu_ps(x))
((h) ? _mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) : _mm512_loadu_ps(x))
#define SIMD_STORE2(x, d, h) \
((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \
: _mm512_storeu_ps(x, d))
......@@ -60,18 +64,16 @@ union AVX_Data {
template <int span>
inline void simd_store(float* dst, AVX_Data* src, bool half_precision)
{
size_t width = (half_precision ? SIMD_WIDTH / 2 : SIMD_WIDTH);
#pragma unroll
for (size_t i = 0; i < span; ++i) {
SIMD_STORE2(dst + SIMD_WIDTH * i, src[i].data, half_precision);
}
for (size_t i = 0; i < span; ++i) { SIMD_STORE2(dst + width * i, src[i].data, half_precision); }
}
template <int span>
inline void simd_load(AVX_Data* dst, float* src, bool half_precision)
{
size_t width = (half_precision ? 1 : SIMD_WIDTH);
#pragma unroll
for (size_t i = 0; i < span; ++i) {
dst[i].data = SIMD_LOAD2(src + SIMD_WIDTH * i, half_precision);
}
for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD2(src + width * i, half_precision); }
}
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment