Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once #pragma once
#ifdef __HIP_PLATFORM_HCC__ #include "ds_kernel_utils.h"
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <cuda.h> #include <cuda.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
...@@ -19,12 +15,17 @@ ...@@ -19,12 +15,17 @@
#define MAX_WARP_NUM 32 #define MAX_WARP_NUM 32
#define WARP_SIZE 32 #define WARP_SIZE 32
#define MAX_THREADS 1024
#define SMs 80 #define SMs 80
#define MAX_REGISTERS 256 #define MAX_REGISTERS 256
template <typename T> template <typename T>
void launch_attn_softmax_v2(T* vals, void launch_attn_softmax_v2(T* vals,
T* mask, T* mask,
T* alibi,
float layer_scale,
bool triangular, bool triangular,
bool recompute, bool recompute,
bool local_attention, bool local_attention,
...@@ -33,7 +34,9 @@ void launch_attn_softmax_v2(T* vals, ...@@ -33,7 +34,9 @@ void launch_attn_softmax_v2(T* vals,
int heads, int heads,
int num_seq, int num_seq,
int sequence_length, int sequence_length,
float scale, int offset,
int mask_stride,
int mp_size,
cudaStream_t stream); cudaStream_t stream);
// Fused bias add with gelu activation // Fused bias add with gelu activation
...@@ -43,6 +46,23 @@ void launch_bias_gelu(T* input, ...@@ -43,6 +46,23 @@ void launch_bias_gelu(T* input,
int intermediate_size, int intermediate_size,
int batch_size, int batch_size,
cudaStream_t stream); cudaStream_t stream);
// Fused bias-add + GeGLU activation launcher (declaration only; kernel defined
// elsewhere). Operates on data viewed as [rows, elems_per_row].
// NOTE(review): exact in/out layout (e.g. whether activation holds the two GLU
// halves interleaved or split) is defined by the kernel implementation — not
// visible here; confirm before relying on it.
template <typename T>
void launch_fused_bias_geglu(T* output,
const T* activation,
const T* bias,
int rows,
int elems_per_row,
cudaStream_t stream);
// Fused bias add with relu activation
template <typename T>
void launch_bias_relu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream);
template <typename T> template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream); void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream);
...@@ -55,32 +75,44 @@ void launch_bias_residual(T* input, ...@@ -55,32 +75,44 @@ void launch_bias_residual(T* input,
int batch, int batch,
int hidden_dim, int hidden_dim,
int mp_size, int mp_size,
bool preln,
cudaStream_t stream); cudaStream_t stream);
template <typename T> template <typename T>
void launch_layer_norm(T* out, void launch_fused_ln(T* output,
T* vals, const T* vals,
const T* gamma, const T* gamma,
const T* beta, const T* beta,
float epsilon, float epsilon,
int batch_size, int rows,
int hidden_dim, int elems_per_row,
cudaStream_t stream); cudaStream_t stream);
template <typename T> template <typename T>
void launch_residual_layer_norm(T* norm, void launch_fused_residual_ln(T* output,
T* res_add, const T* vals,
T* vals, const T* residual,
T* residual, const T* bias,
const T* bias, const T* gamma,
const T* gamma, const T* beta,
const T* beta, float epsilon,
float epsilon, int rows,
int batch_size, int elems_per_row,
int hidden_dim, cudaStream_t stream);
bool preLN,
bool mlp_after_attn, template <typename T>
cudaStream_t stream); void launch_fused_residual_ln_store_pre_ln_res(T* norm_output,
T* res_output,
const T* vals,
const T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int rows,
int elems_per_row,
cudaStream_t stream);
template <typename T> template <typename T>
void launch_dequantize(T* output, void launch_dequantize(T* output,
const int8_t* input, const int8_t* input,
...@@ -92,6 +124,14 @@ void launch_dequantize(T* output, ...@@ -92,6 +124,14 @@ void launch_dequantize(T* output,
cudaStream_t stream); cudaStream_t stream);
template <typename T> template <typename T>
// int8 -> T dequantization launcher (overload; declaration only).
// qscale holds per-group scale factors; the mapping of `groups` onto
// output_size x hidden_dim is defined by the kernel implementation
// (not visible here) — verify against the .cu file before use.
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
cudaStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input, void launch_gptj_residual_add(T* input,
T* output, T* output,
T* attn, T* attn,
...@@ -113,7 +153,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query, ...@@ -113,7 +153,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query,
unsigned batch, unsigned batch,
bool rotate_half, bool rotate_half,
bool rotate_every_two, bool rotate_every_two,
cudaStream_t stream); cudaStream_t stream,
int max_out_tokens);
template <typename T> template <typename T>
void launch_moe_res_matmul(T* residual, void launch_moe_res_matmul(T* residual,
...@@ -122,3 +163,60 @@ void launch_moe_res_matmul(T* residual, ...@@ -122,3 +163,60 @@ void launch_moe_res_matmul(T* residual,
int seq_len, int seq_len,
int hidden_dim, int hidden_dim,
cudaStream_t stream); cudaStream_t stream);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
// (declaration only; trans_count presumably selects how many fused tensors
// to transform — confirm against the kernel implementation)
template <typename T>
void launch_transform4d_0213(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count);
// Fused bias-add + 0213 transform, with optional rotary position embedding
// (rotary_dim / rotate_half / rotate_every_two parameters).
// NOTE(review): roles of vals/vals1/vals2 and the seq_offset/seq_length1
// split are defined by the kernel; not inferable from this declaration.
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
T* vals,
T* vals1,
const T* vals2,
const T* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int seq_length1,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens);
// Copy `output` into `padded_output`, padding each head from head_size to
// padded_head_size (declaration only).
template <typename T>
void pad_data(T* padded_output,
T* output,
int bsz,
int head_size,
int padded_head_size,
cudaStream_t stream);
// Pad both the sequence dimension (seq_len -> padded_seq_len) and the head
// dimension (head_size -> padded_head_size).
template <typename T>
void pad_head_seq(T* padded_output,
T* output,
int bsz,
int seq_len,
int padded_seq_len,
int head_size,
int padded_head_size,
cudaStream_t stream);
// 0213 transform combined with sequence/head padding (see parameters above).
template <typename T>
void launch_pad_add_transform_0213(T* output,
const T* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream);
// !!! This is a file automatically generated by hipify!!! // !!! This is a file automatically generated by hipify!!!
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once #pragma once
#ifdef __HIP_PLATFORM_HCC__ #include "ds_kernel_utils_hip.h"
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include <hip/hip_fp16.h> #include <hip/hip_fp16.h>
...@@ -20,12 +16,17 @@ ...@@ -20,12 +16,17 @@
#define MAX_WARP_NUM 32 #define MAX_WARP_NUM 32
#define WARP_SIZE 32 #define WARP_SIZE 32
#define MAX_THREADS 1024
#define SMs 80 #define SMs 80
#define MAX_REGISTERS 256 #define MAX_REGISTERS 256
template <typename T> template <typename T>
void launch_attn_softmax_v2(T* vals, void launch_attn_softmax_v2(T* vals,
T* mask, T* mask,
T* alibi,
float layer_scale,
bool triangular, bool triangular,
bool recompute, bool recompute,
bool local_attention, bool local_attention,
...@@ -34,7 +35,9 @@ void launch_attn_softmax_v2(T* vals, ...@@ -34,7 +35,9 @@ void launch_attn_softmax_v2(T* vals,
int heads, int heads,
int num_seq, int num_seq,
int sequence_length, int sequence_length,
float scale, int offset,
int mask_stride,
int mp_size,
hipStream_t stream); hipStream_t stream);
// Fused bias add with gelu activation // Fused bias add with gelu activation
...@@ -44,6 +47,23 @@ void launch_bias_gelu(T* input, ...@@ -44,6 +47,23 @@ void launch_bias_gelu(T* input,
int intermediate_size, int intermediate_size,
int batch_size, int batch_size,
hipStream_t stream); hipStream_t stream);
// HIP (hipified) mirror of launch_fused_bias_geglu: fused bias-add + GeGLU
// over [rows, elems_per_row] data. Declaration only; layout details live in
// the kernel implementation.
template <typename T>
void launch_fused_bias_geglu(T* output,
const T* activation,
const T* bias,
int rows,
int elems_per_row,
hipStream_t stream);
// Fused bias add with relu activation
template <typename T>
void launch_bias_relu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
hipStream_t stream);
template <typename T> template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream); void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream);
...@@ -56,32 +76,44 @@ void launch_bias_residual(T* input, ...@@ -56,32 +76,44 @@ void launch_bias_residual(T* input,
int batch, int batch,
int hidden_dim, int hidden_dim,
int mp_size, int mp_size,
bool preln,
hipStream_t stream); hipStream_t stream);
template <typename T> template <typename T>
void launch_layer_norm(T* out, void launch_fused_ln(T* output,
T* vals, const T* vals,
const T* gamma, const T* gamma,
const T* beta, const T* beta,
float epsilon, float epsilon,
int batch_size, int rows,
int hidden_dim, int elems_per_row,
hipStream_t stream); hipStream_t stream);
template <typename T> template <typename T>
void launch_residual_layer_norm(T* norm, void launch_fused_residual_ln(T* output,
T* res_add, const T* vals,
T* vals, const T* residual,
T* residual, const T* bias,
const T* bias, const T* gamma,
const T* gamma, const T* beta,
const T* beta, float epsilon,
float epsilon, int rows,
int batch_size, int elems_per_row,
int hidden_dim, hipStream_t stream);
bool preLN,
bool mlp_after_attn, template <typename T>
hipStream_t stream); void launch_fused_residual_ln_store_pre_ln_res(T* norm_output,
T* res_output,
const T* vals,
const T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int rows,
int elems_per_row,
hipStream_t stream);
template <typename T> template <typename T>
void launch_dequantize(T* output, void launch_dequantize(T* output,
const int8_t* input, const int8_t* input,
...@@ -93,6 +125,14 @@ void launch_dequantize(T* output, ...@@ -93,6 +125,14 @@ void launch_dequantize(T* output,
hipStream_t stream); hipStream_t stream);
template <typename T> template <typename T>
// HIP mirror of the int8 -> T dequantization overload; qscale holds per-group
// scale factors (group mapping defined by the kernel implementation).
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
hipStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input, void launch_gptj_residual_add(T* input,
T* output, T* output,
T* attn, T* attn,
...@@ -114,7 +154,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query, ...@@ -114,7 +154,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query,
unsigned batch, unsigned batch,
bool rotate_half, bool rotate_half,
bool rotate_every_two, bool rotate_every_two,
hipStream_t stream); hipStream_t stream,
int max_out_tokens);
template <typename T> template <typename T>
void launch_moe_res_matmul(T* residual, void launch_moe_res_matmul(T* residual,
...@@ -123,3 +164,60 @@ void launch_moe_res_matmul(T* residual, ...@@ -123,3 +164,60 @@ void launch_moe_res_matmul(T* residual,
int seq_len, int seq_len,
int hidden_dim, int hidden_dim,
hipStream_t stream); hipStream_t stream);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
// (HIP mirrors of the CUDA declarations; semantics identical, stream type is
// hipStream_t)
template <typename T>
void launch_transform4d_0213(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
hipStream_t stream,
int trans_count);
// Fused bias-add + 0213 transform with optional rotary position embedding.
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
T* vals,
T* vals1,
const T* vals2,
const T* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int seq_length1,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
hipStream_t stream,
int trans_count,
int max_out_tokens);
// Pad head dimension head_size -> padded_head_size.
template <typename T>
void pad_data(T* padded_output,
T* output,
int bsz,
int head_size,
int padded_head_size,
hipStream_t stream);
// Pad sequence (seq_len -> padded_seq_len) and head (head_size ->
// padded_head_size) dimensions.
template <typename T>
void pad_head_seq(T* padded_output,
T* output,
int bsz,
int seq_len,
int padded_seq_len,
int head_size,
int padded_head_size,
hipStream_t stream);
// 0213 transform combined with sequence/head padding.
template <typename T>
void launch_pad_add_transform_0213(T* output,
const T* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
hipStream_t stream);
/*
Copyright The Microsoft DeepSpeed Team
*/
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
...@@ -862,6 +866,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, ...@@ -862,6 +866,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
bool invertible, bool invertible,
int row_stride) int row_stride)
{ {
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x; int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride; int iterations = row_stride / iteration_stride;
...@@ -985,6 +990,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, ...@@ -985,6 +990,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
inp_grad_h[high_index] = temp; inp_grad_h[high_index] = temp;
} }
#endif
} }
template <> template <>
...@@ -1172,6 +1178,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, ...@@ -1172,6 +1178,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
__half* inp_grad, __half* inp_grad,
int row_stride) int row_stride)
{ {
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x; int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride; int iterations = row_stride / iteration_stride;
...@@ -1290,6 +1297,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, ...@@ -1290,6 +1297,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
__half2 temp = __float22half2_rn(vals_arr_f[iterations]); __half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp; inp_grad_h[high_index] = temp;
} }
#endif
} }
template <> template <>
...@@ -1601,6 +1609,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, ...@@ -1601,6 +1609,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
bool invertible, bool invertible,
int row_stride) int row_stride)
{ {
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x; int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride; int iterations = row_stride / iteration_stride;
...@@ -1727,6 +1736,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, ...@@ -1727,6 +1736,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
inp_grad_h[high_index] = temp + out_grad_h2[high_index]; inp_grad_h[high_index] = temp + out_grad_h2[high_index];
} }
#endif
} }
template <> template <>
...@@ -1922,6 +1932,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, ...@@ -1922,6 +1932,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
__half* inp_grad, __half* inp_grad,
int row_stride) int row_stride)
{ {
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x; int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride; int iterations = row_stride / iteration_stride;
...@@ -2044,6 +2055,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, ...@@ -2044,6 +2055,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
__half2 temp = __float22half2_rn(vals_arr_f[iterations]); __half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp + out_grad_h2[high_index]; inp_grad_h[high_index] = temp + out_grad_h2[high_index];
} }
#endif
} }
template <> template <>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#include <math.h> #include <math.h>
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
#include "general_kernels.h" #include "general_kernels.h"
...@@ -536,6 +540,102 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/, ...@@ -536,6 +540,102 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
} }
} }
// Softmax backward for arbitrarily long rows, __half specialization:
// grad <- out * (grad - sum_row(grad * out)), computed in float.
// One warp (WARP_SIZE lanes in x) handles one row; blockDim.y rows per block.
// Data is read as float4 (8 __half values per load), so `softmax_length` is
// the row length in float4 units — the launcher (visible below) passes
// seq_length / 8 for __half; the row byte-length must therefore be
// 16-byte aligned and divisible accordingly — TODO confirm at call sites.
__global__ void softmax_backward_kernel_arbitrary_length(__half* grad /* input & output*/,
const __half* output,
int softmax_length)
{
// Row index for this warp.
int batch_idx = blockIdx.x * blockDim.y + threadIdx.y;
// NOTE(review): `offset` already includes threadIdx.x, yet the loops below
// also start curr_idx at threadIdx.x and index the *advanced* pointers with
// it — the per-lane offset appears to be applied twice (lane t touches
// row_base + 2t + k*WARP_SIZE, skipping odd positions). Verify against the
// upstream kernel; this may be a transcription artifact.
int offset = batch_idx * softmax_length + threadIdx.x;
const float4* output_cast = reinterpret_cast<const float4*>(output);
float4* grad_cast = reinterpret_cast<float4*>(grad);
grad_cast += offset;
output_cast += offset;
// Pass 1: accumulate this lane's partial sum of grad * out over the row.
float sum = 0.0;
int curr_idx = threadIdx.x;
while (curr_idx < softmax_length) {
float4 out_reg = output_cast[curr_idx];
float4 grad_reg = grad_cast[curr_idx];
// Reinterpret the float4 registers as 4 __half2 pairs for vectorized math.
__half2* out_h = reinterpret_cast<__half2*>(&out_reg);
__half2* grad_h = reinterpret_cast<__half2*>(&grad_reg);
#pragma unroll
for (int m = 0; m < 4; m++) grad_h[m] *= out_h[m];
// Accumulate all 8 products in float to avoid half-precision loss.
sum += ((float)grad_h[0].x + (float)grad_h[0].y + (float)grad_h[1].x + (float)grad_h[1].y) +
((float)grad_h[2].x + (float)grad_h[2].y + (float)grad_h[3].x + (float)grad_h[3].y);
curr_idx += WARP_SIZE;
}
// Butterfly (xor-shuffle) all-reduce: every lane ends up with the full row sum.
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#pragma unroll
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
// Pass 2: rewrite grad in place as out * (grad - sum), element-wise over
// the 8 __half values of each float4.
curr_idx = threadIdx.x;
while (curr_idx < softmax_length) {
float4 out_reg = output_cast[curr_idx];
float4 grad_reg = grad_cast[curr_idx];
__half* grad_h = reinterpret_cast<__half*>(&grad_reg);
__half* out_h = reinterpret_cast<__half*>(&out_reg);
#pragma unroll
for (int m = 0; m < 8; m++) grad_h[m] = (float)out_h[m] * ((float)grad_h[m] - sum);
grad_cast[curr_idx] = grad_reg;
curr_idx += WARP_SIZE;
}
}
// Softmax backward for arbitrarily long rows, float specialization:
// grad <- out * (grad - sum_row(grad * out)).
// One warp per row, blockDim.y rows per block; data read as float4
// (4 floats per load), so `softmax_length` is the row length in float4
// units — the launcher (visible below) passes seq_length / 4 for float.
__global__ void softmax_backward_kernel_arbitrary_length(float* grad /* input & output*/,
const float* output,
int softmax_length)
{
// Row index for this warp.
int batch_idx = blockIdx.x * blockDim.y + threadIdx.y;
// NOTE(review): same apparent double application of the per-lane
// threadIdx.x offset as in the __half specialization above (offset includes
// threadIdx.x AND curr_idx starts at threadIdx.x on the advanced pointers).
// Verify against the upstream kernel.
int offset = batch_idx * softmax_length + threadIdx.x;
const float4* output_cast = reinterpret_cast<const float4*>(output);
float4* grad_cast = reinterpret_cast<float4*>(grad);
grad_cast += offset;
output_cast += offset;
// Pass 1: lane-local partial sum of grad * out over the row.
float sum = 0.0;
int curr_idx = threadIdx.x;
while (curr_idx < softmax_length) {
float4 out_reg = output_cast[curr_idx];
float4 grad_reg = grad_cast[curr_idx];
grad_reg.x *= out_reg.x;
grad_reg.y *= out_reg.y;
grad_reg.z *= out_reg.z;
grad_reg.w *= out_reg.w;
sum += (grad_reg.x + grad_reg.y + grad_reg.z + grad_reg.w);
curr_idx += WARP_SIZE;
}
// Butterfly (xor-shuffle) all-reduce so every lane holds the full row sum.
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#pragma unroll
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
// Pass 2: rewrite grad in place as out * (grad - sum).
curr_idx = threadIdx.x;
while (curr_idx < softmax_length) {
float4 out_reg = output_cast[curr_idx];
float4 grad_reg = grad_cast[curr_idx];
grad_reg.x = out_reg.x * (grad_reg.x - sum);
grad_reg.y = out_reg.y * (grad_reg.y - sum);
grad_reg.z = out_reg.z * (grad_reg.z - sum);
grad_reg.w = out_reg.w * (grad_reg.w - sum);
grad_cast[curr_idx] = grad_reg;
curr_idx += WARP_SIZE;
}
}
template <typename T> template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad, void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp, const T* soft_inp,
...@@ -575,10 +675,15 @@ void launch_attn_softmax_backward_v2(T* out_grad, ...@@ -575,10 +675,15 @@ void launch_attn_softmax_backward_v2(T* out_grad,
else if (seq_length <= 2048) else if (seq_length <= 2048)
softmax_backward_kernel_v2<T, 64> softmax_backward_kernel_v2<T, 64>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length); <<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 4096)
softmax_backward_kernel_v2<T, 128>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 8192)
softmax_backward_kernel_v2<T, 256>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else else
throw std::runtime_error( softmax_backward_kernel_arbitrary_length<<<grid_dim, block_dim, 0, stream>>>(
std::string("Special sequence length found in softmax backward, seq_length: ") + out_grad, soft_inp, seq_length / (4 << ((sizeof(T) & 2) >> 1)));
std::to_string(seq_length));
} }
template void launch_attn_softmax_backward_v2<__half>(__half* out_grad, template void launch_attn_softmax_backward_v2<__half>(__half* out_grad,
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
#define rows_trans 16 #define rows_trans 16
......
...@@ -4,6 +4,7 @@ Copyright 2020 The Microsoft DeepSpeed Team ...@@ -4,6 +4,7 @@ Copyright 2020 The Microsoft DeepSpeed Team
import sys import sys
import types import types
import json
from typing import Optional, Union from typing import Optional, Union
import torch import torch
from torch.optim import Optimizer from torch.optim import Optimizer
...@@ -17,17 +18,18 @@ from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpe ...@@ -17,17 +18,18 @@ from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpe
from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER
from .runtime.pipe.engine import PipelineEngine from .runtime.pipe.engine import PipelineEngine
from .inference.engine import InferenceEngine from .inference.engine import InferenceEngine
from .inference.config import DeepSpeedInferenceConfig
from .runtime.lr_schedules import add_tuning_arguments from .runtime.lr_schedules import add_tuning_arguments
from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError
from .runtime.activation_checkpointing import checkpointing from .runtime.activation_checkpointing import checkpointing
from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
from .module_inject import replace_transformer_layer, revert_transformer_layer from .module_inject import replace_transformer_layer, revert_transformer_layer
from .utils import log_dist from .utils import log_dist, OnDevice
from .utils.distributed import init_distributed from .comm.comm import init_distributed
from .runtime import zero from .runtime import zero
from .runtime import DeepSpeedOptimizer, ZeROOptimizer
from .pipe import PipelineModule from .pipe import PipelineModule
...@@ -82,7 +84,7 @@ def initialize(args=None, ...@@ -82,7 +84,7 @@ def initialize(args=None,
mpu: Optional: A model parallelism unit object that implements mpu: Optional: A model parallelism unit object that implements
get_{model,data}_parallel_{rank,group,world_size}() get_{model,data}_parallel_{rank,group,world_size}()
dist_init_required: Optional: None will auto-initialize torch.distributed if needed, dist_init_required: Optional: None will auto-initialize torch distributed if needed,
otherwise the user can force it to be initialized or not via boolean. otherwise the user can force it to be initialized or not via boolean.
collate_fn: Optional: Merges a list of samples to form a collate_fn: Optional: Merges a list of samples to form a
...@@ -113,6 +115,10 @@ def initialize(args=None, ...@@ -113,6 +115,10 @@ def initialize(args=None,
__git_hash__, __git_hash__,
__git_branch__), __git_branch__),
ranks=[0]) ranks=[0])
# Disable zero.Init context if it's currently enabled
zero.partition_parameters.shutdown_init_context()
assert model is not None, "deepspeed.initialize requires a model" assert model is not None, "deepspeed.initialize requires a model"
if not isinstance(model, PipelineModule): if not isinstance(model, PipelineModule):
...@@ -217,61 +223,57 @@ def add_config_arguments(parser): ...@@ -217,61 +223,57 @@ def add_config_arguments(parser):
return parser return parser
def init_inference(model, def default_inference_config():
triangular_masking=True, """
mp_size=1, Return a default DeepSpeed inference configuration dictionary.
training_mp_size=1, """
mpu=None, return DeepSpeedInferenceConfig().dict()
ep_group=None,
expert_mp_group=None,
checkpoint=None, def init_inference(model, config=None, **kwargs):
dtype=None,
injection_policy=None,
replace_method='auto',
quantization_setting=None,
replace_with_kernel_inject=False,
return_tuple=True,
ep_size=1,
moe=False,
moe_experts=1,
moe_type='standard',
args=None):
"""Initialize the DeepSpeed InferenceEngine. """Initialize the DeepSpeed InferenceEngine.
Arguments: Description: all four cases are valid and supported in DS init_inference() API.
model: Required: nn.module class before apply any wrappers
triangular_masking: Required: this shows the type of masking for attention scores in transformer layer # Case 1: user provides no config and no kwargs. Default config will be used.
note that the masking is application specific.
mp_size: Optional: Desired model parallel size, default is 1 meaning no .. code-block:: python
model parallelism.
training_mp_size: Optional: if loading a checkpoint this is the mp size that it was trained with, generator.model = deepspeed.init_inference(generator.model)
it may be different than what the mp size that you want to use during inference. string = generator("DeepSpeed is")
print(string)
mpu: Optional: A model parallelism unit object that implements # Case 2: user provides a config and no kwargs. User supplied config will be used.
get_{model,data}_parallel_{rank,group,world_size}()
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model, config=config)
string = generator("DeepSpeed is")
print(string)
# Case 3: user provides no config and uses keyword arguments (kwargs) only.
checkpoint: Optional: Path to deepspeed compatible checkpoint or path to .. code-block:: python
JSON with load policy.
dtype: Optional: Desired model data type, will convert model to this type. generator.model = deepspeed.init_inference(generator.model,
Supported target types: torch.half, torch.int8, torch.float mp_size=world_size,
dtype=torch.half,
replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
injection_policy: Optional: Dictionary mapping a client nn.Module to its corresponding # Case 4: user provides config and keyword arguments (kwargs). Both config and kwargs are merged and kwargs take precedence.
injection policy. e.g., {BertLayer : deepspeed.inference.HFBertLayerPolicy}
replace_method: Optional: If 'auto' DeepSpeed will automatically try and replace .. code-block:: python
model modules with its optimized versions. If an injection_policy is set this will
override the automatic replacement behavior.
quantization_setting: Optional: Quantization settings used for quantizing your model using the MoQ. generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
The setting can be one element or a tuple. If one value is passed in, we consider it as the number string = generator("DeepSpeed is")
of groups used in quantization. A tuple is passed in if we want to mention that there is extra-grouping print(string)
for the MLP part of a Transformer layer (e.g. (True, 8) shows we quantize the model using 8 groups for
all the network except the MLP part that we use 8 extra grouping). Arguments:
replace_with_kernel_inject: If set we inject kernel as we initialize the inference-engine model: Required: original nn.module object without any wrappers
config: Optional: instead of arguments, you can pass in a DS inference config dict or path to JSON file
Returns: Returns:
A deepspeed.InferenceEngine wrapped model. A deepspeed.InferenceEngine wrapped model.
...@@ -282,24 +284,30 @@ def init_inference(model, ...@@ -282,24 +284,30 @@ def init_inference(model,
__git_branch__), __git_branch__),
ranks=[0]) ranks=[0])
engine = InferenceEngine(model, # Load config_dict from config first
triangular_masking, if config is None:
mp_size, config = {}
training_mp_size, if isinstance(config, str):
ep_size, with open(config, "r") as f:
mpu, config_dict = json.load(f)
ep_group, elif isinstance(config, dict):
expert_mp_group, config_dict = config
checkpoint, else:
dtype, raise ValueError(
injection_policy, f"'config' argument expected string or dictionary, got {type(config)}")
return_tuple,
replace_method, # Update with values from kwargs, ensuring no conflicting overlap between config and kwargs
quantization_setting, overlap_keys = set(config_dict.keys()).intersection(kwargs.keys())
replace_with_kernel_inject, # If there is overlap, error out if values are different
moe, for key in overlap_keys:
moe_experts, if config_dict[key] != kwargs[key]:
moe_type, raise ValueError(
args) f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}"
)
config_dict.update(kwargs)
ds_inference_config = DeepSpeedInferenceConfig(**config_dict)
engine = InferenceEngine(model, config=ds_inference_config)
return engine return engine
../accelerator/
\ No newline at end of file
File mode changed from 100644 to 100755
'''Copyright The Microsoft DeepSpeed Team'''
from .autotuner import Autotuner from .autotuner import Autotuner
import copy '''Copyright The Microsoft DeepSpeed Team'''
import json
import os
from random import sample
import shutil import shutil
import subprocess import subprocess
import hjson
import torch
import time import time
import datetime import datetime
import math import math
import hjson
from ..runtime.config_utils import dict_raise_error_on_duplicate_keys from ..runtime.config_utils import dict_raise_error_on_duplicate_keys
from ..runtime.constants import * from ..runtime.constants import *
from ..runtime.zero.constants import *
from ..runtime.zero.config import ZERO_OPTIMIZATION, ZeroStageEnum
from ..utils import logger from ..utils import logger
from .config import DeepSpeedAutotuningConfig from .config import DeepSpeedAutotuningConfig
from .constants import * from .constants import *
from .scheduler import ResourceManager, run_experiment from .scheduler import ResourceManager
from .tuner import GridSearchTuner, RandomTuner, ModelBasedTuner from .tuner import GridSearchTuner, RandomTuner, ModelBasedTuner
from .utils import * from .utils import *
from deepspeed.accelerator import get_accelerator
try: try:
from tabulate import tabulate from tabulate import tabulate
except ImportError: except ImportError:
tabulate = None tabulate = None
try:
import mlflow
has_mlflow = True
except Exception as e:
has_mlflow = False
ZERO_OPTIMIZATION_STAGE = "stage"
OFFLOAD_OPTIMIZER = "offload_optimizer"
OFFLOAD_PARAM = "offload_param"
ZERO_OPTIMIZATION_STAGE_DEFAULT = ZeroStageEnum.disabled
class Autotuner: class Autotuner:
"""The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods. """The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods.
...@@ -42,22 +52,37 @@ class Autotuner: ...@@ -42,22 +52,37 @@ class Autotuner:
assert self.user_config is not None, "DeepSpeed configuration is not provided" assert self.user_config is not None, "DeepSpeed configuration is not provided"
self.autotuning_config = DeepSpeedAutotuningConfig(self.user_config) self.autotuning_config = DeepSpeedAutotuningConfig(self.user_config)
if self.user_config[AUTOTUNING]:
if AUTOTUNING_EXPS_DIR in self.user_config[AUTOTUNING].keys():
del self.user_config[AUTOTUNING][AUTOTUNING_EXPS_DIR]
if AUTOTUNING_RESULTS_DIR in self.user_config[AUTOTUNING].keys():
del self.user_config[AUTOTUNING][AUTOTUNING_RESULTS_DIR]
self.exps_dir = DEFAULT_EXPRS_DIR self.exps_dir = self.autotuning_config.exps_dir
if self.autotuning_config.exps_dir and self.autotuning_config.exps_dir != "":
self.exps_dir = self.autotuning_config.exps_dir
if self.autotuning_config.overwrite and os.path.exists(self.exps_dir): if self.autotuning_config.overwrite and os.path.exists(self.exps_dir):
shutil.rmtree(self.exps_dir, ignore_errors=True) shutil.rmtree(self.exps_dir, ignore_errors=True)
if not os.path.exists(self.exps_dir): if not os.path.exists(self.exps_dir):
os.makedirs(self.exps_dir, exist_ok=True) try:
os.makedirs(self.exps_dir, exist_ok=True)
logger.info(f"Created autotuning experiments directory: {self.exps_dir}")
except:
logger.error(
f"Failed to create {self.exps_dir}, please check `exps_dir` in the autotuning config file is accessible by all the nodes in the job."
)
exit(-1)
self.results_dir = DEFAULT_RESULTS_DIR self.results_dir = self.autotuning_config.results_dir
if self.autotuning_config.results_dir and self.autotuning_config.results_dir != "":
self.results_dir = self.autotuning_config.results_dir
if self.autotuning_config.overwrite and os.path.exists(self.results_dir): if self.autotuning_config.overwrite and os.path.exists(self.results_dir):
shutil.rmtree(self.results_dir, ignore_errors=True) shutil.rmtree(self.results_dir, ignore_errors=True)
if not os.path.exists(self.results_dir): if not os.path.exists(self.results_dir):
os.makedirs(self.results_dir, exist_ok=True) try:
os.makedirs(self.results_dir, exist_ok=True)
logger.info(f"Created autotuning resutls directory: {self.exps_dir}")
except:
logger.error(
f"Failed to create {self.results_dir}, please check `results_dir` in the autotuning config file is accessible by all the nodes in the job."
)
exit(-1)
# set the active resource for the autotuner resource manager # set the active resource for the autotuner resource manager
self.rm = self._get_resource_manager(active_resources) self.rm = self._get_resource_manager(active_resources)
...@@ -70,6 +95,10 @@ class Autotuner: ...@@ -70,6 +95,10 @@ class Autotuner:
self.rm.nodes), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any" self.rm.nodes), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any"
self.records = {} self.records = {}
self.optimal_cmd = None
self.optmal_ds_config = None
self.mlflow_parent_id = None
def print_tuning_results(self): def print_tuning_results(self):
"""Print the autotuning results in tabular format. """Print the autotuning results in tabular format.
...@@ -252,7 +281,7 @@ class Autotuner: ...@@ -252,7 +281,7 @@ class Autotuner:
return False return False
def get_gpu_memory_info(self): def get_gpu_memory_info(self):
return torch.cuda.get_device_properties(0).total_memory return get_accelerator().total_memory()
def get_activation_memory_per_gpu(self): def get_activation_memory_per_gpu(self):
if self.model_info and "activation_mem_per_gpu" in self.model_info: if self.model_info and "activation_mem_per_gpu" in self.model_info:
...@@ -266,18 +295,18 @@ class Autotuner: ...@@ -266,18 +295,18 @@ class Autotuner:
if not num_params: if not num_params:
return 0 return 0
# assume the model uses Adam optimizer # assume the model uses Adam optimizer
# ZERO_OPTIMIZATION_DISABLED: # ZeroStageEnum.disabled:
params_mem = num_params * (2 if fp16_enabled else 4) params_mem = num_params * (2 if fp16_enabled else 4)
gradients_mem = num_params * (2 if fp16_enabled else 4) gradients_mem = num_params * (2 if fp16_enabled else 4)
optimizer_mem = num_params * (16 if fp16_enabled else 8) optimizer_mem = num_params * (16 if fp16_enabled else 8)
if zero_stage >= ZERO_OPTIMIZATION_OPTIMIZER_STATES: if zero_stage >= ZeroStageEnum.optimizer_states:
optimizer_mem = optimizer_mem / total_gpus optimizer_mem = optimizer_mem / total_gpus
if zero_stage >= ZERO_OPTIMIZATION_GRADIENTS: if zero_stage >= ZeroStageEnum.gradients:
gradients_mem = gradients_mem / total_gpus gradients_mem = gradients_mem / total_gpus
if zero_stage >= ZERO_OPTIMIZATION_WEIGHTS: if zero_stage >= ZeroStageEnum.weights:
params_mem = params_mem / total_gpus params_mem = params_mem / total_gpus
mem_per_gpu = (params_mem + gradients_mem + optimizer_mem) / self.mp_size() mem_per_gpu = (params_mem + gradients_mem + optimizer_mem) / self.mp_size()
...@@ -308,7 +337,7 @@ class Autotuner: ...@@ -308,7 +337,7 @@ class Autotuner:
# each zero stage uses a different template configuration file # each zero stage uses a different template configuration file
config_zero = tuning_space.get(ZERO_OPTIMIZATION, {}) config_zero = tuning_space.get(ZERO_OPTIMIZATION, {})
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None) stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT)
template_config = {} template_config = {}
if stage == 0: if stage == 0:
template_path = DEFAULT_TEMPLATE_PATH_ZERO_0 template_path = DEFAULT_TEMPLATE_PATH_ZERO_0
...@@ -331,12 +360,11 @@ class Autotuner: ...@@ -331,12 +360,11 @@ class Autotuner:
model_info = self.model_info model_info = self.model_info
if model_info and "hidden_size" in model_info: if model_info and "hidden_size" in model_info:
hs = model_info["hidden_size"] hs = model_info["hidden_size"]
template_config[ZERO_OPTIMIZATION]['reduce_bucket_size'] = hs * hs
template_config[ZERO_OPTIMIZATION][ template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE] = hs * hs 'stage3_prefetch_bucket_size'] = 0.9 * hs * hs
template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE] = 0.9 * hs * hs
template_config[ZERO_OPTIMIZATION][ template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD] = 10 * hs 'stage3_param_persistence_threshold'] = 10 * hs
prefix = "z3_" prefix = "z3_"
else: else:
return exps return exps
...@@ -355,11 +383,11 @@ class Autotuner: ...@@ -355,11 +383,11 @@ class Autotuner:
logger.debug(f"tuning_keys = {tuning_keys}") logger.debug(f"tuning_keys = {tuning_keys}")
logger.debug(f"before prunning total configs = {len(all_configs)}") logger.debug(f"before pruning total configs = {len(all_configs)}")
pruned_list = prune_configs(all_configs) pruned_list = prune_configs(all_configs)
logger.debug(f"after prunning total configs = {len(pruned_list)}") logger.debug(f"after pruning total configs = {len(pruned_list)}")
for config in pruned_list: for config in pruned_list:
exp_config = copy.deepcopy(template_config) exp_config = copy.deepcopy(template_config)
...@@ -375,7 +403,6 @@ class Autotuner: ...@@ -375,7 +403,6 @@ class Autotuner:
if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[ if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[
ZERO_OPTIMIZATION]: ZERO_OPTIMIZATION]:
del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM] del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM]
# set gradient accumulation steps according to max_train_batch_size_per_gpu # set gradient accumulation steps according to max_train_batch_size_per_gpu
mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]
gas = max_train_batch_size_per_gpu // mbs gas = max_train_batch_size_per_gpu // mbs
...@@ -396,6 +423,10 @@ class Autotuner: ...@@ -396,6 +423,10 @@ class Autotuner:
def tune(self): def tune(self):
""" Tunes Zero stages, micro batch size per GPU, and other Zero configurations. Performance metrics of different tuning spaces are recorded in self.records. """ Tunes Zero stages, micro batch size per GPU, and other Zero configurations. Performance metrics of different tuning spaces are recorded in self.records.
""" """
if has_mlflow:
self.mlflow_parent_id = os.environ['MLFLOW_RUN_ID']
mlflow.start_run(run_id=self.mlflow_parent_id)
self.start_time = time.time() self.start_time = time.time()
if self.fast_enabled(): if self.fast_enabled():
logger.info(f"Fast mode is enabled. Tuning micro batch size only.") logger.info(f"Fast mode is enabled. Tuning micro batch size only.")
...@@ -420,9 +451,11 @@ class Autotuner: ...@@ -420,9 +451,11 @@ class Autotuner:
f"The model requires at least {memory_to_string(self.activation_mem, postfix='B')} activation memory for micro batch size 1." f"The model requires at least {memory_to_string(self.activation_mem, postfix='B')} activation memory for micro batch size 1."
) )
#TODO: FIX THIS
stage = self.user_config.get(ZERO_OPTIMIZATION, stage = self.user_config.get(ZERO_OPTIMIZATION,
{}).get(ZERO_OPTIMIZATION_STAGE, {}).get(ZERO_OPTIMIZATION_STAGE,
"all") "all")
stage = "all"
user_zero_stages = [stage] if not isinstance(stage, list) else stage user_zero_stages = [stage] if not isinstance(stage, list) else stage
logger.info(f"User-defined zero stages are {stage}.") logger.info(f"User-defined zero stages are {stage}.")
...@@ -431,9 +464,9 @@ class Autotuner: ...@@ -431,9 +464,9 @@ class Autotuner:
metric_val = 0 metric_val = 0
required_gpu_mem = self.get_instantiation_memory_required_per_gpu( required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_DISABLED) + self.activation_mem ZeroStageEnum.disabled) + self.activation_mem
if self.gpu_mem > required_gpu_mem: if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_DISABLED in user_zero_stages: if "all" in user_zero_stages or ZeroStageEnum.disabled in user_zero_stages:
logger.info( logger.info(
f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space" f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space"
) )
...@@ -443,15 +476,17 @@ class Autotuner: ...@@ -443,15 +476,17 @@ class Autotuner:
mbs = next_mbs mbs = next_mbs
max_mbs = next_max_mbs max_mbs = next_max_mbs
metric_val = next_metric_val metric_val = next_metric_val
if has_mlflow:
mlflow.log_metric(f"z0{self.metric()}", next_metric_val)
else: else:
logger.info( logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_DISABLED} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" f"The model is not runable with ZERO stage {ZeroStageEnum.disabled} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
) )
required_gpu_mem = self.get_instantiation_memory_required_per_gpu( required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_OPTIMIZER_STATES) + self.activation_mem ZeroStageEnum.optimizer_states) + self.activation_mem
if self.gpu_mem > required_gpu_mem: if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_OPTIMIZER_STATES in user_zero_stages: if "all" in user_zero_stages or ZeroStageEnum.optimizer_states in user_zero_stages:
logger.info( logger.info(
f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space" f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space"
) )
...@@ -461,15 +496,17 @@ class Autotuner: ...@@ -461,15 +496,17 @@ class Autotuner:
mbs = next_mbs mbs = next_mbs
max_mbs = next_max_mbs max_mbs = next_max_mbs
metric_val = next_metric_val metric_val = next_metric_val
if has_mlflow:
mlflow.log_metric(f"z1{self.metric()}", next_metric_val)
else: else:
logger.info( logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_OPTIMIZER_STATES} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" f"The model is not runable with ZERO stage {ZeroStageEnum.optimizer_states} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
) )
required_gpu_mem = self.get_instantiation_memory_required_per_gpu( required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_GRADIENTS) + self.activation_mem ZeroStageEnum.gradients) + self.activation_mem
if self.gpu_mem > required_gpu_mem: if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_GRADIENTS in user_zero_stages: if "all" in user_zero_stages or ZeroStageEnum.gradients in user_zero_stages:
logger.info( logger.info(
f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space" f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space"
) )
...@@ -479,25 +516,31 @@ class Autotuner: ...@@ -479,25 +516,31 @@ class Autotuner:
mbs = next_mbs mbs = next_mbs
max_mbs = next_max_mbs max_mbs = next_max_mbs
metric_val = next_metric_val metric_val = next_metric_val
if has_mlflow:
mlflow.log_metric(f"z2{self.metric()}", next_metric_val)
else: else:
logger.info( logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_GRADIENTS} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" f"The model is not runable with ZERO stage {ZeroStageEnum.gradients} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
) )
required_gpu_mem = self.get_instantiation_memory_required_per_gpu( required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_WEIGHTS) + self.activation_mem ZeroStageEnum.weights) + self.activation_mem
if self.gpu_mem > required_gpu_mem: if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_WEIGHTS in user_zero_stages: if "all" in user_zero_stages or ZeroStageEnum.weights in user_zero_stages:
logger.info( logger.info(
f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space" f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space"
) )
_, _, _ = self.tune_space( _, _, next_metric_val = self.tune_space(
DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val) DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val)
if has_mlflow:
mlflow.log_metric(f"z3{self.metric()}", next_metric_val)
else: else:
logger.info( logger.info(
f"The model has {self.get_model_num_params()} parameters and requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory per GPU with DeepSpeed Zero stage {ZERO_OPTIMIZATION_WEIGHTS} optimization. Memory per GPU in system is {memory_to_string(self.gpu_mem)}. No tuning is performed." f"The model has {self.get_model_num_params()} parameters and requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory per GPU with DeepSpeed Zero stage {ZeroStageEnum.weights} optimization. Memory per GPU in system is {memory_to_string(self.gpu_mem)}. No tuning is performed."
) )
return return
if has_mlflow:
mlflow.end_run()
def tune_space(self, def tune_space(self,
tuning_space, tuning_space,
...@@ -505,7 +548,7 @@ class Autotuner: ...@@ -505,7 +548,7 @@ class Autotuner:
prev_best_mbs=0, prev_best_mbs=0,
prev_best_metric_val=0): prev_best_metric_val=0):
config_zero = tuning_space.get(ZERO_OPTIMIZATION, {}) config_zero = tuning_space.get(ZERO_OPTIMIZATION, {})
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT) stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None)
tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage) tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage)
tuning_micro_batch_sizes = [] tuning_micro_batch_sizes = []
max_train_batch_size_per_gpu = 0 max_train_batch_size_per_gpu = 0
...@@ -785,11 +828,12 @@ class Autotuner: ...@@ -785,11 +828,12 @@ class Autotuner:
self.rm.schedule_experiments(exp_paths) self.rm.schedule_experiments(exp_paths)
self.rm.run() self.rm.run()
for exp_id, (exp, err) in self.rm.finished_experiments.items(): for exp_id, (exp, err) in self.rm.finished_experiments.items():
if exp: if exp:
metric_file = exp[DS_CONFIG][AUTOTUNING][AUTOTUNING_METRIC_PATH] metric_file = exp[DS_CONFIG][AUTOTUNING][AUTOTUNING_METRIC_PATH]
if os.path.exists(metric_file): if os.path.exists(metric_file):
with open(metric_file, 'r') as f: with open(metric_file, 'r') as f:
results = hjson.load(f) results = hjson.load(f)
metric_val = results[self.metric()] metric_val = results[self.metric()]
...@@ -797,11 +841,19 @@ class Autotuner: ...@@ -797,11 +841,19 @@ class Autotuner:
if max_micro_batch_size == exp[DS_CONFIG][ if max_micro_batch_size == exp[DS_CONFIG][
TRAIN_MICRO_BATCH_SIZE_PER_GPU]: TRAIN_MICRO_BATCH_SIZE_PER_GPU]:
max_micro_batch_size_metric_val = metric_val max_micro_batch_size_metric_val = metric_val
if has_mlflow:
os.environ.pop('MLFLOW_RUN_ID')
mlflow.start_run(nested=True, run_name=exp['name'])
for metric in results:
mlflow.log_metric(metric, results[metric])
mlflow.end_run()
os.environ['MLFLOW_RUN_ID'] = self.mlflow_parent_id
else: else:
self.update_records(tuning_space_name, exp, 0, 1) self.update_records(tuning_space_name, exp, 0, 1)
else: else:
mbs = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] mbs = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]
logger.info(f"micro batch size = {mbs} was not run successfully") logger.info(f"micro batch size = {mbs} was not run successfully")
self.rm.clear() self.rm.clear()
if tuning_micro_batch_sizes_overwritten: if tuning_micro_batch_sizes_overwritten:
...@@ -831,7 +883,18 @@ class Autotuner: ...@@ -831,7 +883,18 @@ class Autotuner:
self.exp_num_gpus * self.exp_num_nodes // self.mp_size() self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs) exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs)
exp, metric_val = self.run_ds_config(ds_config, exp_name) exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val: if metric_val:
with open(metric_file, 'r') as f:
results = hjson.load(f)
metric_val = results[self.metric()]
if has_mlflow:
os.environ.pop('MLFLOW_RUN_ID')
mlflow.start_run(nested=True, run_name=exp_name)
for metric in results:
mlflow.log_metric(metric, results[metric])
mlflow.end_run()
os.environ['MLFLOW_RUN_ID'] = self.mlflow_parent_id
self.update_records(tuning_space_name, exp, metric_val, 1) self.update_records(tuning_space_name, exp, metric_val, 1)
if metric_val > prev_best_metric_val * (1 + METRIC_PERCENT_DIFF_CONST): if metric_val > prev_best_metric_val * (1 + METRIC_PERCENT_DIFF_CONST):
prev_best_metric_val = metric_val prev_best_metric_val = metric_val
...@@ -843,7 +906,6 @@ class Autotuner: ...@@ -843,7 +906,6 @@ class Autotuner:
break break
if prev_best_mbs != max_micro_batch_size: if prev_best_mbs != max_micro_batch_size:
tuning_micro_batch_sizes[-1] = prev_best_mbs tuning_micro_batch_sizes[-1] = prev_best_mbs
return tuning_micro_batch_sizes return tuning_micro_batch_sizes
def get_min_max_micro_batch_size(self, def get_min_max_micro_batch_size(self,
...@@ -961,11 +1023,10 @@ class Autotuner: ...@@ -961,11 +1023,10 @@ class Autotuner:
low = min_micro_batch_size low = min_micro_batch_size
high = max_micro_batch_size high = max_micro_batch_size
while low < high: # binary search until low is the smallest micro batch size that OOMs.
while low <= high:
mid = int((low + high) // 2) mid = int((low + high) // 2)
logger.debug(f"trying mbs = {mid}, low = {low}, high = {high}") logger.debug(f"trying mbs = {mid}, low = {low}, high = {high}")
if mid == low:
break
if mid not in used_micro_batch_sizes: if mid not in used_micro_batch_sizes:
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mid ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mid
ds_config[TRAIN_BATCH_SIZE] = mid * gas * \ ds_config[TRAIN_BATCH_SIZE] = mid * gas * \
...@@ -973,7 +1034,7 @@ class Autotuner: ...@@ -973,7 +1034,7 @@ class Autotuner:
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mid) exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mid)
exp, metric_val = self.run_ds_config(ds_config, exp_name) exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val: if metric_val:
low = mid low = mid + 1
self.update_records(tuning_space_name, exp, metric_val, 1) self.update_records(tuning_space_name, exp, metric_val, 1)
used_micro_batch_sizes.append(mid) used_micro_batch_sizes.append(mid)
if prev_metric_val and ((metric_val - prev_metric_val) / if prev_metric_val and ((metric_val - prev_metric_val) /
...@@ -985,8 +1046,8 @@ class Autotuner: ...@@ -985,8 +1046,8 @@ class Autotuner:
self.update_records(tuning_space_name, exp, 0, 1) self.update_records(tuning_space_name, exp, 0, 1)
high = mid - 1 high = mid - 1
else: else:
low = mid low = mid + 1
max_micro_batch_size = low max_micro_batch_size = low - 1
logger.info( logger.info(
f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}." f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}."
...@@ -1084,26 +1145,18 @@ class Autotuner: ...@@ -1084,26 +1145,18 @@ class Autotuner:
json.dump(exp_config, fd) json.dump(exp_config, fd)
fd.flush() fd.flush()
os.fsync(fd) os.fsync(fd)
self.rm.schedule_experiments([exp_path]) self.rm.schedule_experiments([exp_path])
self.rm.run() self.rm.run()
exp, metric_val = self.rm.parse_results(self.metric()) exp, metric_val = self.rm.parse_results(self.metric())
self.rm.clear() self.rm.clear()
return exp, metric_val return exp, metric_val
def run_after_tuning(self): def write_optimal_config(self):
""" Launches the training with the optmimal DeepSpeed configuration found through the autotuning process.
"ds_config_optimal.json" describing the optmimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir.
"""
best_space_records = self.get_best_space_records() best_space_records = self.get_best_space_records()
if GLOBAL_TUNING_SPACE not in best_space_records: if GLOBAL_TUNING_SPACE not in best_space_records:
return return
best_exp, best_metric_val, _ = best_space_records[GLOBAL_TUNING_SPACE] best_exp, best_metric_val, _ = best_space_records[GLOBAL_TUNING_SPACE]
if best_exp: if best_exp:
logger.info(
"Start training with the optmimal DeepSpeed configuration found through the tuning process"
)
exp_dir = best_exp["result_dir"] exp_dir = best_exp["result_dir"]
cmd = None cmd = None
with open(os.path.join(exp_dir, "cmd.txt"), "r") as f: with open(os.path.join(exp_dir, "cmd.txt"), "r") as f:
...@@ -1115,18 +1168,27 @@ class Autotuner: ...@@ -1115,18 +1168,27 @@ class Autotuner:
ds_config_path = os.path.join(self.results_dir, "ds_config_optimal.json") ds_config_path = os.path.join(self.results_dir, "ds_config_optimal.json")
json.dump(ds_config, open(ds_config_path, "w")) json.dump(ds_config, open(ds_config_path, "w"))
idx = cmd.index(os.path.join(exp_dir, "ds_config.json"))
cmd[idx] = ds_config_path
cmd_path = os.path.join(self.results_dir, "cmd_optimal.txt") cmd_path = os.path.join(self.results_dir, "cmd_optimal.txt")
with open(cmd_path, "w") as fd: with open(cmd_path, "w") as fd:
fd.write(" ".join(cmd)) fd.write(" ".join(cmd))
fd.write("\n") fd.write("\n")
fd.flush() fd.flush()
self.optimal_cmd = cmd
self.optmal_ds_config = ds_config
logger.info(
f"Wrote the optimal DeepSpeed configuration found by autotuning to {ds_config_path}, and the corresponding DeepSpeed command to {cmd_path}"
)
result = subprocess.Popen(cmd) def run_after_tuning(self):
""" Launches the training with the optimal DeepSpeed configuration found through the autotuning process.
"ds_config_optimal.json" describing the optmimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir.
"""
if self.optimal_cmd:
result = subprocess.Popen(self.optimal_cmd)
result.wait() result.wait()
logger.info( logger.info(
f"Done running with the optimal DeepSpeed configuration found by autotuning: {ds_config_path}" f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}"
) )
else:
logger.info(f"No optimal DeepSpeed configuration found by autotuning.")
'''Copyright The Microsoft DeepSpeed Team'''
""" """
Copyright (c) Microsoft Corporation Copyright (c) Microsoft Corporation
Licensed under the MIT license. Licensed under the MIT license.
...@@ -41,11 +42,11 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigObject): ...@@ -41,11 +42,11 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigObject):
self.results_dir = get_scalar_param(autotuning_dict, self.results_dir = get_scalar_param(autotuning_dict,
AUTOTUNING_RESULTS_DIR, AUTOTUNING_RESULTS_DIR,
AUTOTUNING_RESULTS_DIR_DEFAULT) AUTOTUNING_RESULTS_DIR_DEFAULT)
assert self.results_dir, "results_dir cannot be empty"
self.exps_dir = get_scalar_param(autotuning_dict, self.exps_dir = get_scalar_param(autotuning_dict,
AUTOTUNING_EXPS_DIR, AUTOTUNING_EXPS_DIR,
AUTOTUNING_EXPS_DIR_DEFAULT) AUTOTUNING_EXPS_DIR_DEFAULT)
assert self.exps_dir, "exps_dir cannot be empty"
self.overwrite = get_scalar_param(autotuning_dict, self.overwrite = get_scalar_param(autotuning_dict,
AUTOTUNING_OVERWRITE, AUTOTUNING_OVERWRITE,
AUTOTUNING_OVERWRITE_DEFAULT) AUTOTUNING_OVERWRITE_DEFAULT)
......
'''Copyright The Microsoft DeepSpeed Team'''
""" """
Copyright (c) Microsoft Corporation Copyright (c) Microsoft Corporation
Licensed under the MIT license. Licensed under the MIT license.
...@@ -22,9 +23,6 @@ DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__f ...@@ -22,9 +23,6 @@ DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__f
"config_templates", "config_templates",
"template_zero3.json") "template_zero3.json")
DEFAULT_EXPRS_DIR = os.path.join(os.getcwd(), "autotuning_exps")
DEFAULT_RESULTS_DIR = os.path.join(os.getcwd(), "autotuning_results")
METRIC_PERCENT_DIFF_CONST = 0.05 METRIC_PERCENT_DIFF_CONST = 0.05
DS_CONFIG = "ds_config" DS_CONFIG = "ds_config"
BUFSIZE = 1 # line buffer size for writing files BUFSIZE = 1 # line buffer size for writing files
...@@ -54,10 +52,10 @@ AUTOTUNING_FAST = "fast" ...@@ -54,10 +52,10 @@ AUTOTUNING_FAST = "fast"
AUTOTUNING_FAST_DEFAULT = True AUTOTUNING_FAST_DEFAULT = True
AUTOTUNING_RESULTS_DIR = "results_dir" AUTOTUNING_RESULTS_DIR = "results_dir"
AUTOTUNING_RESULTS_DIR_DEFAULT = None AUTOTUNING_RESULTS_DIR_DEFAULT = "autotuning_results"
AUTOTUNING_EXPS_DIR = "exps_dir" AUTOTUNING_EXPS_DIR = "exps_dir"
AUTOTUNING_EXPS_DIR_DEFAULT = None AUTOTUNING_EXPS_DIR_DEFAULT = "autotuning_exps"
AUTOTUNING_OVERWRITE = "overwrite" AUTOTUNING_OVERWRITE = "overwrite"
AUTOTUNING_OVERWRITE_DEFAULT = True AUTOTUNING_OVERWRITE_DEFAULT = True
......
'''Copyright The Microsoft DeepSpeed Team'''
import copy import copy
from re import I
from numpy import BUFSIZE from numpy import BUFSIZE
from deepspeed.env_report import SUCCESS
from enum import Flag
import json import json
import os
import subprocess import subprocess
import sys import sys
import threading import threading
import time import time
from pathlib import Path import base64
from typing import List
import os
import hjson import hjson
from tqdm import tqdm from tqdm import tqdm
from ..utils import logger from ..utils import logger
from .constants import *
from .constants import AUTOTUNING, AUTOTUNING_METRIC_PATH from .constants import AUTOTUNING, AUTOTUNING_METRIC_PATH
from .utils import get_val_by_key, search_error, was_interruptted from .utils import get_val_by_key, search_error, was_interruptted
""" """
...@@ -25,9 +22,7 @@ thread-0: loop over experiment queue dispatching experiments if they become avai ...@@ -25,9 +22,7 @@ thread-0: loop over experiment queue dispatching experiments if they become avai
thread-N: start each experiment in its own thread thread-N: start each experiment in its own thread
""" """
import torch.distributed as dist from deepspeed import comm as dist
from datetime import datetime
TIMEOUT = 5 TIMEOUT = 5
...@@ -188,7 +183,6 @@ class ResourceManager: ...@@ -188,7 +183,6 @@ class ResourceManager:
logger.debug(f'Put exp_id = {exp["exp_id"]} back into the queue') logger.debug(f'Put exp_id = {exp["exp_id"]} back into the queue')
self.experiment_check(pbar) self.experiment_check(pbar)
else: else:
desc = "" desc = ""
for reservation in reservations: for reservation in reservations:
reservation.slots.sort() reservation.slots.sort()
...@@ -344,19 +338,27 @@ def run_experiment(exp: dict, reservations, user_script, user_args): ...@@ -344,19 +338,27 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
exp["job_id"] = get_job_id() exp["job_id"] = get_job_id()
exp_dir = exp["result_dir"] exp_dir = exp["result_dir"]
os.makedirs(exp_dir, exist_ok=True) os.makedirs(exp_dir, exist_ok=True)
ds_config_path = os.path.join(exp_dir, "ds_config.json")
exp["ds_config_path"] = os.path.join(exp_dir, "ds_config.json") exp["ds_config_path"] = ds_config_path
ds_config = copy.deepcopy(exp["ds_config"]) ds_config = copy.deepcopy(exp["ds_config"])
ds_config_json = json.dumps(ds_config).encode('utf-8')
exp["ds_config_base64"] = base64.urlsafe_b64encode(ds_config_json).decode('utf-8')
with open(exp["ds_config_path"], "w", buffering=BUFSIZE) as fd: with open(exp["ds_config_path"], "w", buffering=BUFSIZE) as fd:
json.dump(ds_config, fd) json.dump(ds_config, fd)
fd.flush() fd.flush()
os.fsync(fd) os.fsync(fd)
path = exp["ds_config_path"]
logger.info(f"Scheduler wrote ds_config to {path}, {os.path.abspath(path)}")
with open(os.path.join(exp_dir, "exp.json"), "w", buffering=BUFSIZE) as fd: with open(os.path.join(exp_dir, "exp.json"), "w", buffering=BUFSIZE) as fd:
json.dump(exp, fd) json.dump(exp, fd)
fd.flush() fd.flush()
os.fsync(fd) os.fsync(fd)
path = os.path.join(exp_dir, "exp.json")
logger.info(f"Scheduler wrote exp to {path}, {os.path.abspath(path)}")
# remove "--deepspeed_config ds_config.json" from user_args # remove "--deepspeed_config ds_config.json" from user_args
if user_args: if user_args:
...@@ -365,9 +367,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args): ...@@ -365,9 +367,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
# "--deepspeed_config" is omitted in HF # "--deepspeed_config" is omitted in HF
elif "--deepspeed" in user_args: elif "--deepspeed" in user_args:
idx = user_args.index("--deepspeed") idx = user_args.index("--deepspeed")
assert idx < len(user_args) and ".json" in user_args[idx + assert idx < len(user_args), "there is no ds_config file specified after --deepspeed_config or --deepspeed"
1], "there is no ds_config file specified after --deepspeed_config or --deepspeed" # user_args[idx + 1] = exp["ds_config_path"]
user_args[idx + 1] = exp["ds_config_path"] # pass base64 serialized ds_config to launcher
user_args[idx + 1] = exp["ds_config_base64"]
exp["user_script"] = user_script exp["user_script"] = user_script
exp["user_args"] = user_args exp["user_args"] = user_args
...@@ -382,7 +385,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args): ...@@ -382,7 +385,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
fd.flush() fd.flush()
os.fsync(fd) os.fsync(fd)
logger.info(f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}") logger.info(
f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}, and ds_config = {os.path.abspath(ds_config_path)}"
)
with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open( with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open(
os.path.join(exp_dir, "stderr.log"), "wb" os.path.join(exp_dir, "stderr.log"), "wb"
...@@ -396,7 +401,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args): ...@@ -396,7 +401,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
clean_up(exp, reservations) clean_up(exp, reservations)
logger.info(f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}") logger.info(
f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}"
)
PDSH_MAX_FAN_OUT = 1024 PDSH_MAX_FAN_OUT = 1024
......
'''Copyright The Microsoft DeepSpeed Team'''
from .index_based_tuner import RandomTuner, GridSearchTuner from .index_based_tuner import RandomTuner, GridSearchTuner
# from .ga_tuner import GATuner # from .ga_tuner import GATuner
from .model_based_tuner import ModelBasedTuner from .model_based_tuner import ModelBasedTuner
import atexit '''Copyright The Microsoft DeepSpeed Team'''
import sys import sys
from deepspeed.autotuning.constants import * from deepspeed.autotuning.constants import *
from deepspeed.autotuning.utils import write_experiments from deepspeed.autotuning.utils import write_experiments
from deepspeed.utils import logger from deepspeed.utils import logger
import json
class BaseTuner: class BaseTuner:
def __init__(self, exps, resource_manager, metric): def __init__(self, exps, resource_manager, metric):
......
import numpy as np '''Copyright The Microsoft DeepSpeed Team'''
from .utils import * from .utils import *
......
import random '''Copyright The Microsoft DeepSpeed Team'''
from deepspeed.utils import logger import random
from .base_tuner import BaseTuner from .base_tuner import BaseTuner
......
'''Copyright The Microsoft DeepSpeed Team'''
import hjson import hjson
import numpy as np
from deepspeed.utils import logger
from ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH, AUTOTUNING_METRIC_DEFAULT from ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH
from .base_tuner import BaseTuner from .base_tuner import BaseTuner
from .cost_model import XGBoostCostModel from .cost_model import XGBoostCostModel
from .utils import * from .utils import *
......
'''Copyright The Microsoft DeepSpeed Team'''
import numpy as np import numpy as np
import itertools import itertools
from ..utils import * from ..utils import *
......
'''Copyright The Microsoft DeepSpeed Team'''
import re import re
import collections.abc import collections.abc
import os import os
import json import json
from deepspeed.runtime.constants import GRADIENT_ACCUMULATION_STEPS, TRAIN_MICRO_BATCH_SIZE_PER_GPU from deepspeed.runtime.constants import GRADIENT_ACCUMULATION_STEPS, TRAIN_MICRO_BATCH_SIZE_PER_GPU
import hjson
import sys
import itertools import itertools
import copy import copy
...@@ -35,23 +35,11 @@ def was_interruptted(filename): ...@@ -35,23 +35,11 @@ def was_interruptted(filename):
return False return False
def was_interruptted(filename):
if not os.path.exists(filename):
return "stderr.log does not exist"
with open(filename) as f:
for line in f:
s = "KeyboardInterrupt"
idx = line.find(s)
if idx != -1:
return True
return False
def find_replace_str(value, replace_dict): def find_replace_str(value, replace_dict):
if not isinstance(value, str): if not isinstance(value, str):
return str(value) return str(value)
matches = re.findall("\$[A-Za-z0-9_]+", value) matches = re.findall(r"\$[A-Za-z0-9_]+", value)
for var in matches: for var in matches:
var_key = var.replace("$", "").lower() var_key = var.replace("$", "").lower()
if var_key == "nvme_path": if var_key == "nvme_path":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment