Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once #pragma once
#ifdef __HIP_PLATFORM_HCC__ #include "ds_kernel_utils.h"
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <cuda.h> #include <cuda.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
...@@ -19,12 +15,17 @@ ...@@ -19,12 +15,17 @@
#define MAX_WARP_NUM 32 #define MAX_WARP_NUM 32
#define WARP_SIZE 32 #define WARP_SIZE 32
#define MAX_THREADS 1024
#define SMs 80 #define SMs 80
#define MAX_REGISTERS 256 #define MAX_REGISTERS 256
template <typename T> template <typename T>
void launch_attn_softmax_v2(T* vals, void launch_attn_softmax_v2(T* vals,
T* mask, T* mask,
T* alibi,
float layer_scale,
bool triangular, bool triangular,
bool recompute, bool recompute,
bool local_attention, bool local_attention,
...@@ -33,7 +34,9 @@ void launch_attn_softmax_v2(T* vals, ...@@ -33,7 +34,9 @@ void launch_attn_softmax_v2(T* vals,
int heads, int heads,
int num_seq, int num_seq,
int sequence_length, int sequence_length,
float scale, int offset,
int mask_stride,
int mp_size,
cudaStream_t stream); cudaStream_t stream);
// Fused bias add with gelu activation // Fused bias add with gelu activation
...@@ -43,6 +46,23 @@ void launch_bias_gelu(T* input, ...@@ -43,6 +46,23 @@ void launch_bias_gelu(T* input,
int intermediate_size, int intermediate_size,
int batch_size, int batch_size,
cudaStream_t stream); cudaStream_t stream);
// Fused bias-add + GeGLU activation launcher (declaration only; kernel defined
// elsewhere). Operates on data viewed as [rows, elems_per_row].
// NOTE(review): exact in/out layout (e.g. whether activation holds the two GLU
// halves interleaved or split) is defined by the kernel implementation — not
// visible here; confirm before relying on it.
template <typename T>
void launch_fused_bias_geglu(T* output,
const T* activation,
const T* bias,
int rows,
int elems_per_row,
cudaStream_t stream);
// Fused bias add with relu activation
template <typename T>
void launch_bias_relu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream);
template <typename T> template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream); void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream);
...@@ -55,32 +75,44 @@ void launch_bias_residual(T* input, ...@@ -55,32 +75,44 @@ void launch_bias_residual(T* input,
int batch, int batch,
int hidden_dim, int hidden_dim,
int mp_size, int mp_size,
bool preln,
cudaStream_t stream); cudaStream_t stream);
template <typename T> template <typename T>
void launch_layer_norm(T* out, void launch_fused_ln(T* output,
T* vals, const T* vals,
const T* gamma, const T* gamma,
const T* beta, const T* beta,
float epsilon, float epsilon,
int batch_size, int rows,
int hidden_dim, int elems_per_row,
cudaStream_t stream); cudaStream_t stream);
template <typename T> template <typename T>
void launch_residual_layer_norm(T* norm, void launch_fused_residual_ln(T* output,
T* res_add, const T* vals,
T* vals, const T* residual,
T* residual, const T* bias,
const T* bias, const T* gamma,
const T* gamma, const T* beta,
const T* beta, float epsilon,
float epsilon, int rows,
int batch_size, int elems_per_row,
int hidden_dim, cudaStream_t stream);
bool preLN,
bool mlp_after_attn, template <typename T>
cudaStream_t stream); void launch_fused_residual_ln_store_pre_ln_res(T* norm_output,
T* res_output,
const T* vals,
const T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int rows,
int elems_per_row,
cudaStream_t stream);
template <typename T> template <typename T>
void launch_dequantize(T* output, void launch_dequantize(T* output,
const int8_t* input, const int8_t* input,
...@@ -92,6 +124,14 @@ void launch_dequantize(T* output, ...@@ -92,6 +124,14 @@ void launch_dequantize(T* output,
cudaStream_t stream); cudaStream_t stream);
template <typename T> template <typename T>
// int8 -> T dequantization launcher (overload; declaration only).
// qscale holds per-group scale factors; the mapping of `groups` onto
// output_size x hidden_dim is defined by the kernel implementation
// (not visible here) — verify against the .cu file before use.
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
cudaStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input, void launch_gptj_residual_add(T* input,
T* output, T* output,
T* attn, T* attn,
...@@ -113,7 +153,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query, ...@@ -113,7 +153,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query,
unsigned batch, unsigned batch,
bool rotate_half, bool rotate_half,
bool rotate_every_two, bool rotate_every_two,
cudaStream_t stream); cudaStream_t stream,
int max_out_tokens);
template <typename T> template <typename T>
void launch_moe_res_matmul(T* residual, void launch_moe_res_matmul(T* residual,
...@@ -122,3 +163,60 @@ void launch_moe_res_matmul(T* residual, ...@@ -122,3 +163,60 @@ void launch_moe_res_matmul(T* residual,
int seq_len, int seq_len,
int hidden_dim, int hidden_dim,
cudaStream_t stream); cudaStream_t stream);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
// (declaration only; trans_count presumably selects how many fused tensors
// to transform — confirm against the kernel implementation)
template <typename T>
void launch_transform4d_0213(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count);
// Fused bias-add + 0213 transform, with optional rotary position embedding
// (rotary_dim / rotate_half / rotate_every_two parameters).
// NOTE(review): roles of vals/vals1/vals2 and the seq_offset/seq_length1
// split are defined by the kernel; not inferable from this declaration.
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
T* vals,
T* vals1,
const T* vals2,
const T* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int seq_length1,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens);
// Copy `output` into `padded_output`, padding each head from head_size to
// padded_head_size (declaration only).
template <typename T>
void pad_data(T* padded_output,
T* output,
int bsz,
int head_size,
int padded_head_size,
cudaStream_t stream);
// Pad both the sequence dimension (seq_len -> padded_seq_len) and the head
// dimension (head_size -> padded_head_size).
template <typename T>
void pad_head_seq(T* padded_output,
T* output,
int bsz,
int seq_len,
int padded_seq_len,
int head_size,
int padded_head_size,
cudaStream_t stream);
// 0213 transform combined with sequence/head padding (see parameters above).
template <typename T>
void launch_pad_add_transform_0213(T* output,
const T* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream);
// !!! This is a file automatically generated by hipify!!! // !!! This is a file automatically generated by hipify!!!
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once #pragma once
#ifdef __HIP_PLATFORM_HCC__ #include "ds_kernel_utils_hip.h"
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include <hip/hip_fp16.h> #include <hip/hip_fp16.h>
...@@ -20,12 +16,17 @@ ...@@ -20,12 +16,17 @@
#define MAX_WARP_NUM 32 #define MAX_WARP_NUM 32
#define WARP_SIZE 32 #define WARP_SIZE 32
#define MAX_THREADS 1024
#define SMs 80 #define SMs 80
#define MAX_REGISTERS 256 #define MAX_REGISTERS 256
template <typename T> template <typename T>
void launch_attn_softmax_v2(T* vals, void launch_attn_softmax_v2(T* vals,
T* mask, T* mask,
T* alibi,
float layer_scale,
bool triangular, bool triangular,
bool recompute, bool recompute,
bool local_attention, bool local_attention,
...@@ -34,7 +35,9 @@ void launch_attn_softmax_v2(T* vals, ...@@ -34,7 +35,9 @@ void launch_attn_softmax_v2(T* vals,
int heads, int heads,
int num_seq, int num_seq,
int sequence_length, int sequence_length,
float scale, int offset,
int mask_stride,
int mp_size,
hipStream_t stream); hipStream_t stream);
// Fused bias add with gelu activation // Fused bias add with gelu activation
...@@ -44,6 +47,23 @@ void launch_bias_gelu(T* input, ...@@ -44,6 +47,23 @@ void launch_bias_gelu(T* input,
int intermediate_size, int intermediate_size,
int batch_size, int batch_size,
hipStream_t stream); hipStream_t stream);
// HIP (hipified) mirror of launch_fused_bias_geglu: fused bias-add + GeGLU
// over [rows, elems_per_row] data. Declaration only; layout details live in
// the kernel implementation.
template <typename T>
void launch_fused_bias_geglu(T* output,
const T* activation,
const T* bias,
int rows,
int elems_per_row,
hipStream_t stream);
// Fused bias add with relu activation
template <typename T>
void launch_bias_relu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
hipStream_t stream);
template <typename T> template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream); void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream);
...@@ -56,32 +76,44 @@ void launch_bias_residual(T* input, ...@@ -56,32 +76,44 @@ void launch_bias_residual(T* input,
int batch, int batch,
int hidden_dim, int hidden_dim,
int mp_size, int mp_size,
bool preln,
hipStream_t stream); hipStream_t stream);
template <typename T> template <typename T>
void launch_layer_norm(T* out, void launch_fused_ln(T* output,
T* vals, const T* vals,
const T* gamma, const T* gamma,
const T* beta, const T* beta,
float epsilon, float epsilon,
int batch_size, int rows,
int hidden_dim, int elems_per_row,
hipStream_t stream); hipStream_t stream);
template <typename T> template <typename T>
void launch_residual_layer_norm(T* norm, void launch_fused_residual_ln(T* output,
T* res_add, const T* vals,
T* vals, const T* residual,
T* residual, const T* bias,
const T* bias, const T* gamma,
const T* gamma, const T* beta,
const T* beta, float epsilon,
float epsilon, int rows,
int batch_size, int elems_per_row,
int hidden_dim, hipStream_t stream);
bool preLN,
bool mlp_after_attn, template <typename T>
hipStream_t stream); void launch_fused_residual_ln_store_pre_ln_res(T* norm_output,
T* res_output,
const T* vals,
const T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int rows,
int elems_per_row,
hipStream_t stream);
template <typename T> template <typename T>
void launch_dequantize(T* output, void launch_dequantize(T* output,
const int8_t* input, const int8_t* input,
...@@ -93,6 +125,14 @@ void launch_dequantize(T* output, ...@@ -93,6 +125,14 @@ void launch_dequantize(T* output,
hipStream_t stream); hipStream_t stream);
template <typename T> template <typename T>
// HIP mirror of the int8 -> T dequantization overload; qscale holds per-group
// scale factors (group mapping defined by the kernel implementation).
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
hipStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input, void launch_gptj_residual_add(T* input,
T* output, T* output,
T* attn, T* attn,
...@@ -114,7 +154,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query, ...@@ -114,7 +154,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query,
unsigned batch, unsigned batch,
bool rotate_half, bool rotate_half,
bool rotate_every_two, bool rotate_every_two,
hipStream_t stream); hipStream_t stream,
int max_out_tokens);
template <typename T> template <typename T>
void launch_moe_res_matmul(T* residual, void launch_moe_res_matmul(T* residual,
...@@ -123,3 +164,60 @@ void launch_moe_res_matmul(T* residual, ...@@ -123,3 +164,60 @@ void launch_moe_res_matmul(T* residual,
int seq_len, int seq_len,
int hidden_dim, int hidden_dim,
hipStream_t stream); hipStream_t stream);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
// (HIP mirrors of the CUDA declarations; semantics identical, stream type is
// hipStream_t)
template <typename T>
void launch_transform4d_0213(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
hipStream_t stream,
int trans_count);
// Fused bias-add + 0213 transform with optional rotary position embedding.
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
T* vals,
T* vals1,
const T* vals2,
const T* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int seq_length1,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
hipStream_t stream,
int trans_count,
int max_out_tokens);
// Pad head dimension head_size -> padded_head_size.
template <typename T>
void pad_data(T* padded_output,
T* output,
int bsz,
int head_size,
int padded_head_size,
hipStream_t stream);
// Pad sequence (seq_len -> padded_seq_len) and head (head_size ->
// padded_head_size) dimensions.
template <typename T>
void pad_head_seq(T* padded_output,
T* output,
int bsz,
int seq_len,
int padded_seq_len,
int head_size,
int padded_head_size,
hipStream_t stream);
// 0213 transform combined with sequence/head padding.
template <typename T>
void launch_pad_add_transform_0213(T* output,
const T* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
hipStream_t stream);
/*
Copyright The Microsoft DeepSpeed Team
*/
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
...@@ -862,6 +866,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, ...@@ -862,6 +866,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
bool invertible, bool invertible,
int row_stride) int row_stride)
{ {
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x; int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride; int iterations = row_stride / iteration_stride;
...@@ -985,6 +990,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, ...@@ -985,6 +990,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
inp_grad_h[high_index] = temp; inp_grad_h[high_index] = temp;
} }
#endif
} }
template <> template <>
...@@ -1172,6 +1178,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, ...@@ -1172,6 +1178,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
__half* inp_grad, __half* inp_grad,
int row_stride) int row_stride)
{ {
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x; int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride; int iterations = row_stride / iteration_stride;
...@@ -1290,6 +1297,7 @@ __global__ void LayerNormBackward2(const __half* out_grad, ...@@ -1290,6 +1297,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
__half2 temp = __float22half2_rn(vals_arr_f[iterations]); __half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp; inp_grad_h[high_index] = temp;
} }
#endif
} }
template <> template <>
...@@ -1601,6 +1609,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, ...@@ -1601,6 +1609,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
bool invertible, bool invertible,
int row_stride) int row_stride)
{ {
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x; int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride; int iterations = row_stride / iteration_stride;
...@@ -1727,6 +1736,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, ...@@ -1727,6 +1736,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
inp_grad_h[high_index] = temp + out_grad_h2[high_index]; inp_grad_h[high_index] = temp + out_grad_h2[high_index];
} }
#endif
} }
template <> template <>
...@@ -1922,6 +1932,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, ...@@ -1922,6 +1932,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
__half* inp_grad, __half* inp_grad,
int row_stride) int row_stride)
{ {
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x; int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride; int iterations = row_stride / iteration_stride;
...@@ -2044,6 +2055,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1, ...@@ -2044,6 +2055,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
__half2 temp = __float22half2_rn(vals_arr_f[iterations]); __half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp + out_grad_h2[high_index]; inp_grad_h[high_index] = temp + out_grad_h2[high_index];
} }
#endif
} }
template <> template <>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#include <math.h> #include <math.h>
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
#include "general_kernels.h" #include "general_kernels.h"
...@@ -536,6 +540,102 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/, ...@@ -536,6 +540,102 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
} }
} }
// Softmax backward for arbitrarily long rows, __half specialization:
// grad <- out * (grad - sum_row(grad * out)), computed in float.
// One warp (WARP_SIZE lanes in x) handles one row; blockDim.y rows per block.
// Data is read as float4 (8 __half values per load), so `softmax_length` is
// the row length in float4 units — the launcher (visible below) passes
// seq_length / 8 for __half; the row byte-length must therefore be
// 16-byte aligned and divisible accordingly — TODO confirm at call sites.
__global__ void softmax_backward_kernel_arbitrary_length(__half* grad /* input & output*/,
const __half* output,
int softmax_length)
{
// Row index for this warp.
int batch_idx = blockIdx.x * blockDim.y + threadIdx.y;
// NOTE(review): `offset` already includes threadIdx.x, yet the loops below
// also start curr_idx at threadIdx.x and index the *advanced* pointers with
// it — the per-lane offset appears to be applied twice (lane t touches
// row_base + 2t + k*WARP_SIZE, skipping odd positions). Verify against the
// upstream kernel; this may be a transcription artifact.
int offset = batch_idx * softmax_length + threadIdx.x;
const float4* output_cast = reinterpret_cast<const float4*>(output);
float4* grad_cast = reinterpret_cast<float4*>(grad);
grad_cast += offset;
output_cast += offset;
// Pass 1: accumulate this lane's partial sum of grad * out over the row.
float sum = 0.0;
int curr_idx = threadIdx.x;
while (curr_idx < softmax_length) {
float4 out_reg = output_cast[curr_idx];
float4 grad_reg = grad_cast[curr_idx];
// Reinterpret the float4 registers as 4 __half2 pairs for vectorized math.
__half2* out_h = reinterpret_cast<__half2*>(&out_reg);
__half2* grad_h = reinterpret_cast<__half2*>(&grad_reg);
#pragma unroll
for (int m = 0; m < 4; m++) grad_h[m] *= out_h[m];
// Accumulate all 8 products in float to avoid half-precision loss.
sum += ((float)grad_h[0].x + (float)grad_h[0].y + (float)grad_h[1].x + (float)grad_h[1].y) +
((float)grad_h[2].x + (float)grad_h[2].y + (float)grad_h[3].x + (float)grad_h[3].y);
curr_idx += WARP_SIZE;
}
// Butterfly (xor-shuffle) all-reduce: every lane ends up with the full row sum.
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#pragma unroll
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
// Pass 2: rewrite grad in place as out * (grad - sum), element-wise over
// the 8 __half values of each float4.
curr_idx = threadIdx.x;
while (curr_idx < softmax_length) {
float4 out_reg = output_cast[curr_idx];
float4 grad_reg = grad_cast[curr_idx];
__half* grad_h = reinterpret_cast<__half*>(&grad_reg);
__half* out_h = reinterpret_cast<__half*>(&out_reg);
#pragma unroll
for (int m = 0; m < 8; m++) grad_h[m] = (float)out_h[m] * ((float)grad_h[m] - sum);
grad_cast[curr_idx] = grad_reg;
curr_idx += WARP_SIZE;
}
}
// Softmax backward for arbitrarily long rows, float specialization:
// grad <- out * (grad - sum_row(grad * out)).
// One warp per row, blockDim.y rows per block; data read as float4
// (4 floats per load), so `softmax_length` is the row length in float4
// units — the launcher (visible below) passes seq_length / 4 for float.
__global__ void softmax_backward_kernel_arbitrary_length(float* grad /* input & output*/,
const float* output,
int softmax_length)
{
// Row index for this warp.
int batch_idx = blockIdx.x * blockDim.y + threadIdx.y;
// NOTE(review): same apparent double application of the per-lane
// threadIdx.x offset as in the __half specialization above (offset includes
// threadIdx.x AND curr_idx starts at threadIdx.x on the advanced pointers).
// Verify against the upstream kernel.
int offset = batch_idx * softmax_length + threadIdx.x;
const float4* output_cast = reinterpret_cast<const float4*>(output);
float4* grad_cast = reinterpret_cast<float4*>(grad);
grad_cast += offset;
output_cast += offset;
// Pass 1: lane-local partial sum of grad * out over the row.
float sum = 0.0;
int curr_idx = threadIdx.x;
while (curr_idx < softmax_length) {
float4 out_reg = output_cast[curr_idx];
float4 grad_reg = grad_cast[curr_idx];
grad_reg.x *= out_reg.x;
grad_reg.y *= out_reg.y;
grad_reg.z *= out_reg.z;
grad_reg.w *= out_reg.w;
sum += (grad_reg.x + grad_reg.y + grad_reg.z + grad_reg.w);
curr_idx += WARP_SIZE;
}
// Butterfly (xor-shuffle) all-reduce so every lane holds the full row sum.
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#pragma unroll
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
// Pass 2: rewrite grad in place as out * (grad - sum).
curr_idx = threadIdx.x;
while (curr_idx < softmax_length) {
float4 out_reg = output_cast[curr_idx];
float4 grad_reg = grad_cast[curr_idx];
grad_reg.x = out_reg.x * (grad_reg.x - sum);
grad_reg.y = out_reg.y * (grad_reg.y - sum);
grad_reg.z = out_reg.z * (grad_reg.z - sum);
grad_reg.w = out_reg.w * (grad_reg.w - sum);
grad_cast[curr_idx] = grad_reg;
curr_idx += WARP_SIZE;
}
}
template <typename T> template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad, void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp, const T* soft_inp,
...@@ -575,10 +675,15 @@ void launch_attn_softmax_backward_v2(T* out_grad, ...@@ -575,10 +675,15 @@ void launch_attn_softmax_backward_v2(T* out_grad,
else if (seq_length <= 2048) else if (seq_length <= 2048)
softmax_backward_kernel_v2<T, 64> softmax_backward_kernel_v2<T, 64>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length); <<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 4096)
softmax_backward_kernel_v2<T, 128>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 8192)
softmax_backward_kernel_v2<T, 256>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else else
throw std::runtime_error( softmax_backward_kernel_arbitrary_length<<<grid_dim, block_dim, 0, stream>>>(
std::string("Special sequence length found in softmax backward, seq_length: ") + out_grad, soft_inp, seq_length / (4 << ((sizeof(T) & 2) >> 1)));
std::to_string(seq_length));
} }
template void launch_attn_softmax_backward_v2<__half>(__half* out_grad, template void launch_attn_softmax_backward_v2<__half>(__half* out_grad,
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
#define rows_trans 16 #define rows_trans 16
......
...@@ -4,6 +4,7 @@ Copyright 2020 The Microsoft DeepSpeed Team ...@@ -4,6 +4,7 @@ Copyright 2020 The Microsoft DeepSpeed Team
import sys import sys
import types import types
import json
from typing import Optional, Union from typing import Optional, Union
import torch import torch
from torch.optim import Optimizer from torch.optim import Optimizer
...@@ -17,17 +18,18 @@ from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpe ...@@ -17,17 +18,18 @@ from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpe
from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER
from .runtime.pipe.engine import PipelineEngine from .runtime.pipe.engine import PipelineEngine
from .inference.engine import InferenceEngine from .inference.engine import InferenceEngine
from .inference.config import DeepSpeedInferenceConfig
from .runtime.lr_schedules import add_tuning_arguments from .runtime.lr_schedules import add_tuning_arguments
from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError
from .runtime.activation_checkpointing import checkpointing from .runtime.activation_checkpointing import checkpointing
from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
from .module_inject import replace_transformer_layer, revert_transformer_layer from .module_inject import replace_transformer_layer, revert_transformer_layer
from .utils import log_dist from .utils import log_dist, OnDevice
from .utils.distributed import init_distributed from .comm.comm import init_distributed
from .runtime import zero from .runtime import zero
from .runtime import DeepSpeedOptimizer, ZeROOptimizer
from .pipe import PipelineModule from .pipe import PipelineModule
...@@ -82,7 +84,7 @@ def initialize(args=None, ...@@ -82,7 +84,7 @@ def initialize(args=None,
mpu: Optional: A model parallelism unit object that implements mpu: Optional: A model parallelism unit object that implements
get_{model,data}_parallel_{rank,group,world_size}() get_{model,data}_parallel_{rank,group,world_size}()
dist_init_required: Optional: None will auto-initialize torch.distributed if needed, dist_init_required: Optional: None will auto-initialize torch distributed if needed,
otherwise the user can force it to be initialized or not via boolean. otherwise the user can force it to be initialized or not via boolean.
collate_fn: Optional: Merges a list of samples to form a collate_fn: Optional: Merges a list of samples to form a
...@@ -113,6 +115,10 @@ def initialize(args=None, ...@@ -113,6 +115,10 @@ def initialize(args=None,
__git_hash__, __git_hash__,
__git_branch__), __git_branch__),
ranks=[0]) ranks=[0])
# Disable zero.Init context if it's currently enabled
zero.partition_parameters.shutdown_init_context()
assert model is not None, "deepspeed.initialize requires a model" assert model is not None, "deepspeed.initialize requires a model"
if not isinstance(model, PipelineModule): if not isinstance(model, PipelineModule):
...@@ -217,61 +223,57 @@ def add_config_arguments(parser): ...@@ -217,61 +223,57 @@ def add_config_arguments(parser):
return parser return parser
def init_inference(model, def default_inference_config():
triangular_masking=True, """
mp_size=1, Return a default DeepSpeed inference configuration dictionary.
training_mp_size=1, """
mpu=None, return DeepSpeedInferenceConfig().dict()
ep_group=None,
expert_mp_group=None,
checkpoint=None, def init_inference(model, config=None, **kwargs):
dtype=None,
injection_policy=None,
replace_method='auto',
quantization_setting=None,
replace_with_kernel_inject=False,
return_tuple=True,
ep_size=1,
moe=False,
moe_experts=1,
moe_type='standard',
args=None):
"""Initialize the DeepSpeed InferenceEngine. """Initialize the DeepSpeed InferenceEngine.
Arguments: Description: all four cases are valid and supported in DS init_inference() API.
model: Required: nn.module class before apply any wrappers
triangular_masking: Required: this shows the type of masking for attention scores in transformer layer # Case 1: user provides no config and no kwargs. Default config will be used.
note that the masking is application specific.
mp_size: Optional: Desired model parallel size, default is 1 meaning no .. code-block:: python
model parallelism.
training_mp_size: Optional: if loading a checkpoint this is the mp size that it was trained with, generator.model = deepspeed.init_inference(generator.model)
it may be different than what the mp size that you want to use during inference. string = generator("DeepSpeed is")
print(string)
mpu: Optional: A model parallelism unit object that implements # Case 2: user provides a config and no kwargs. User supplied config will be used.
get_{model,data}_parallel_{rank,group,world_size}()
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model, config=config)
string = generator("DeepSpeed is")
print(string)
# Case 3: user provides no config and uses keyword arguments (kwargs) only.
checkpoint: Optional: Path to deepspeed compatible checkpoint or path to .. code-block:: python
JSON with load policy.
dtype: Optional: Desired model data type, will convert model to this type. generator.model = deepspeed.init_inference(generator.model,
Supported target types: torch.half, torch.int8, torch.float mp_size=world_size,
dtype=torch.half,
replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
injection_policy: Optional: Dictionary mapping a client nn.Module to its corresponding # Case 4: user provides config and keyword arguments (kwargs). Both config and kwargs are merged and kwargs take precedence.
injection policy. e.g., {BertLayer : deepspeed.inference.HFBertLayerPolicy}
replace_method: Optional: If 'auto' DeepSpeed will automatically try and replace .. code-block:: python
model modules with its optimized versions. If an injection_policy is set this will
override the automatic replacement behavior.
quantization_setting: Optional: Quantization settings used for quantizing your model using the MoQ. generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
The setting can be one element or a tuple. If one value is passed in, we consider it as the number string = generator("DeepSpeed is")
of groups used in quantization. A tuple is passed in if we want to mention that there is extra-grouping print(string)
for the MLP part of a Transformer layer (e.g. (True, 8) shows we quantize the model using 8 groups for
all the network except the MLP part that we use 8 extra grouping). Arguments:
replace_with_kernel_inject: If set we inject kernel as we initialize the inference-engine model: Required: original nn.module object without any wrappers
config: Optional: instead of arguments, you can pass in a DS inference config dict or path to JSON file
Returns: Returns:
A deepspeed.InferenceEngine wrapped model. A deepspeed.InferenceEngine wrapped model.
...@@ -282,24 +284,30 @@ def init_inference(model, ...@@ -282,24 +284,30 @@ def init_inference(model,
__git_branch__), __git_branch__),
ranks=[0]) ranks=[0])
engine = InferenceEngine(model, # Load config_dict from config first
triangular_masking, if config is None:
mp_size, config = {}
training_mp_size, if isinstance(config, str):
ep_size, with open(config, "r") as f:
mpu, config_dict = json.load(f)
ep_group, elif isinstance(config, dict):
expert_mp_group, config_dict = config
checkpoint, else:
dtype, raise ValueError(
injection_policy, f"'config' argument expected string or dictionary, got {type(config)}")
return_tuple,
replace_method, # Update with values from kwargs, ensuring no conflicting overlap between config and kwargs
quantization_setting, overlap_keys = set(config_dict.keys()).intersection(kwargs.keys())
replace_with_kernel_inject, # If there is overlap, error out if values are different
moe, for key in overlap_keys:
moe_experts, if config_dict[key] != kwargs[key]:
moe_type, raise ValueError(
args) f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}"
)
config_dict.update(kwargs)
ds_inference_config = DeepSpeedInferenceConfig(**config_dict)
engine = InferenceEngine(model, config=ds_inference_config)
return engine return engine
../accelerator/
\ No newline at end of file
File mode changed from 100644 to 100755
'''Copyright The Microsoft DeepSpeed Team'''
from .autotuner import Autotuner from .autotuner import Autotuner
import copy '''Copyright The Microsoft DeepSpeed Team'''
import json
import os
from random import sample
import shutil import shutil
import subprocess import subprocess
import hjson
import torch
import time import time
import datetime import datetime
import math import math
import hjson
from ..runtime.config_utils import dict_raise_error_on_duplicate_keys from ..runtime.config_utils import dict_raise_error_on_duplicate_keys
from ..runtime.constants import * from ..runtime.constants import *
from ..runtime.zero.constants import *
from ..runtime.zero.config import ZERO_OPTIMIZATION, ZeroStageEnum
from ..utils import logger from ..utils import logger
from .config import DeepSpeedAutotuningConfig from .config import DeepSpeedAutotuningConfig
from .constants import * from .constants import *
from .scheduler import ResourceManager, run_experiment from .scheduler import ResourceManager
from .tuner import GridSearchTuner, RandomTuner, ModelBasedTuner from .tuner import GridSearchTuner, RandomTuner, ModelBasedTuner
from .utils import * from .utils import *
from deepspeed.accelerator import get_accelerator
try: try:
from tabulate import tabulate from tabulate import tabulate
except ImportError: except ImportError:
tabulate = None tabulate = None
try:
import mlflow
has_mlflow = True
except Exception as e:
has_mlflow = False
ZERO_OPTIMIZATION_STAGE = "stage"
OFFLOAD_OPTIMIZER = "offload_optimizer"
OFFLOAD_PARAM = "offload_param"
ZERO_OPTIMIZATION_STAGE_DEFAULT = ZeroStageEnum.disabled
class Autotuner: class Autotuner:
"""The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods. """The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods.
...@@ -42,22 +52,37 @@ class Autotuner: ...@@ -42,22 +52,37 @@ class Autotuner:
assert self.user_config is not None, "DeepSpeed configuration is not provided" assert self.user_config is not None, "DeepSpeed configuration is not provided"
self.autotuning_config = DeepSpeedAutotuningConfig(self.user_config) self.autotuning_config = DeepSpeedAutotuningConfig(self.user_config)
if self.user_config[AUTOTUNING]:
if AUTOTUNING_EXPS_DIR in self.user_config[AUTOTUNING].keys():
del self.user_config[AUTOTUNING][AUTOTUNING_EXPS_DIR]
if AUTOTUNING_RESULTS_DIR in self.user_config[AUTOTUNING].keys():
del self.user_config[AUTOTUNING][AUTOTUNING_RESULTS_DIR]
self.exps_dir = DEFAULT_EXPRS_DIR self.exps_dir = self.autotuning_config.exps_dir
if self.autotuning_config.exps_dir and self.autotuning_config.exps_dir != "":
self.exps_dir = self.autotuning_config.exps_dir
if self.autotuning_config.overwrite and os.path.exists(self.exps_dir): if self.autotuning_config.overwrite and os.path.exists(self.exps_dir):
shutil.rmtree(self.exps_dir, ignore_errors=True) shutil.rmtree(self.exps_dir, ignore_errors=True)
if not os.path.exists(self.exps_dir): if not os.path.exists(self.exps_dir):
os.makedirs(self.exps_dir, exist_ok=True) try:
os.makedirs(self.exps_dir, exist_ok=True)
logger.info(f"Created autotuning experiments directory: {self.exps_dir}")
except:
logger.error(
f"Failed to create {self.exps_dir}, please check `exps_dir` in the autotuning config file is accessible by all the nodes in the job."
)
exit(-1)
self.results_dir = DEFAULT_RESULTS_DIR self.results_dir = self.autotuning_config.results_dir
if self.autotuning_config.results_dir and self.autotuning_config.results_dir != "":
self.results_dir = self.autotuning_config.results_dir
if self.autotuning_config.overwrite and os.path.exists(self.results_dir): if self.autotuning_config.overwrite and os.path.exists(self.results_dir):
shutil.rmtree(self.results_dir, ignore_errors=True) shutil.rmtree(self.results_dir, ignore_errors=True)
if not os.path.exists(self.results_dir): if not os.path.exists(self.results_dir):
os.makedirs(self.results_dir, exist_ok=True) try:
os.makedirs(self.results_dir, exist_ok=True)
logger.info(f"Created autotuning resutls directory: {self.exps_dir}")
except:
logger.error(
f"Failed to create {self.results_dir}, please check `results_dir` in the autotuning config file is accessible by all the nodes in the job."
)
exit(-1)
# set the active resource for the autotuner resource manager # set the active resource for the autotuner resource manager
self.rm = self._get_resource_manager(active_resources) self.rm = self._get_resource_manager(active_resources)
...@@ -70,6 +95,10 @@ class Autotuner: ...@@ -70,6 +95,10 @@ class Autotuner:
self.rm.nodes), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any" self.rm.nodes), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any"
self.records = {} self.records = {}
self.optimal_cmd = None
self.optmal_ds_config = None
self.mlflow_parent_id = None
def print_tuning_results(self): def print_tuning_results(self):
"""Print the autotuning results in tabular format. """Print the autotuning results in tabular format.
...@@ -252,7 +281,7 @@ class Autotuner: ...@@ -252,7 +281,7 @@ class Autotuner:
return False return False
def get_gpu_memory_info(self): def get_gpu_memory_info(self):
return torch.cuda.get_device_properties(0).total_memory return get_accelerator().total_memory()
def get_activation_memory_per_gpu(self): def get_activation_memory_per_gpu(self):
if self.model_info and "activation_mem_per_gpu" in self.model_info: if self.model_info and "activation_mem_per_gpu" in self.model_info:
...@@ -266,18 +295,18 @@ class Autotuner: ...@@ -266,18 +295,18 @@ class Autotuner:
if not num_params: if not num_params:
return 0 return 0
# assume the model uses Adam optimizer # assume the model uses Adam optimizer
# ZERO_OPTIMIZATION_DISABLED: # ZeroStageEnum.disabled:
params_mem = num_params * (2 if fp16_enabled else 4) params_mem = num_params * (2 if fp16_enabled else 4)
gradients_mem = num_params * (2 if fp16_enabled else 4) gradients_mem = num_params * (2 if fp16_enabled else 4)
optimizer_mem = num_params * (16 if fp16_enabled else 8) optimizer_mem = num_params * (16 if fp16_enabled else 8)
if zero_stage >= ZERO_OPTIMIZATION_OPTIMIZER_STATES: if zero_stage >= ZeroStageEnum.optimizer_states:
optimizer_mem = optimizer_mem / total_gpus optimizer_mem = optimizer_mem / total_gpus
if zero_stage >= ZERO_OPTIMIZATION_GRADIENTS: if zero_stage >= ZeroStageEnum.gradients:
gradients_mem = gradients_mem / total_gpus gradients_mem = gradients_mem / total_gpus
if zero_stage >= ZERO_OPTIMIZATION_WEIGHTS: if zero_stage >= ZeroStageEnum.weights:
params_mem = params_mem / total_gpus params_mem = params_mem / total_gpus
mem_per_gpu = (params_mem + gradients_mem + optimizer_mem) / self.mp_size() mem_per_gpu = (params_mem + gradients_mem + optimizer_mem) / self.mp_size()
...@@ -308,7 +337,7 @@ class Autotuner: ...@@ -308,7 +337,7 @@ class Autotuner:
# each zero stage uses a different template configuration file # each zero stage uses a different template configuration file
config_zero = tuning_space.get(ZERO_OPTIMIZATION, {}) config_zero = tuning_space.get(ZERO_OPTIMIZATION, {})
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None) stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT)
template_config = {} template_config = {}
if stage == 0: if stage == 0:
template_path = DEFAULT_TEMPLATE_PATH_ZERO_0 template_path = DEFAULT_TEMPLATE_PATH_ZERO_0
...@@ -331,12 +360,11 @@ class Autotuner: ...@@ -331,12 +360,11 @@ class Autotuner:
model_info = self.model_info model_info = self.model_info
if model_info and "hidden_size" in model_info: if model_info and "hidden_size" in model_info:
hs = model_info["hidden_size"] hs = model_info["hidden_size"]
template_config[ZERO_OPTIMIZATION]['reduce_bucket_size'] = hs * hs
template_config[ZERO_OPTIMIZATION][ template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE] = hs * hs 'stage3_prefetch_bucket_size'] = 0.9 * hs * hs
template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE] = 0.9 * hs * hs
template_config[ZERO_OPTIMIZATION][ template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD] = 10 * hs 'stage3_param_persistence_threshold'] = 10 * hs
prefix = "z3_" prefix = "z3_"
else: else:
return exps return exps
...@@ -355,11 +383,11 @@ class Autotuner: ...@@ -355,11 +383,11 @@ class Autotuner:
logger.debug(f"tuning_keys = {tuning_keys}") logger.debug(f"tuning_keys = {tuning_keys}")
logger.debug(f"before prunning total configs = {len(all_configs)}") logger.debug(f"before pruning total configs = {len(all_configs)}")
pruned_list = prune_configs(all_configs) pruned_list = prune_configs(all_configs)
logger.debug(f"after prunning total configs = {len(pruned_list)}") logger.debug(f"after pruning total configs = {len(pruned_list)}")
for config in pruned_list: for config in pruned_list:
exp_config = copy.deepcopy(template_config) exp_config = copy.deepcopy(template_config)
...@@ -375,7 +403,6 @@ class Autotuner: ...@@ -375,7 +403,6 @@ class Autotuner:
if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[ if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[
ZERO_OPTIMIZATION]: ZERO_OPTIMIZATION]:
del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM] del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM]
# set gradient accumulation steps according to max_train_batch_size_per_gpu # set gradient accumulation steps according to max_train_batch_size_per_gpu
mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]
gas = max_train_batch_size_per_gpu // mbs gas = max_train_batch_size_per_gpu // mbs
...@@ -396,6 +423,10 @@ class Autotuner: ...@@ -396,6 +423,10 @@ class Autotuner:
def tune(self): def tune(self):
""" Tunes Zero stages, micro batch size per GPU, and other Zero configurations. Performance metrics of different tuning spaces are recorded in self.records. """ Tunes Zero stages, micro batch size per GPU, and other Zero configurations. Performance metrics of different tuning spaces are recorded in self.records.
""" """
if has_mlflow:
self.mlflow_parent_id = os.environ['MLFLOW_RUN_ID']
mlflow.start_run(run_id=self.mlflow_parent_id)
self.start_time = time.time() self.start_time = time.time()
if self.fast_enabled(): if self.fast_enabled():
logger.info(f"Fast mode is enabled. Tuning micro batch size only.") logger.info(f"Fast mode is enabled. Tuning micro batch size only.")
...@@ -420,9 +451,11 @@ class Autotuner: ...@@ -420,9 +451,11 @@ class Autotuner:
f"The model requires at least {memory_to_string(self.activation_mem, postfix='B')} activation memory for micro batch size 1." f"The model requires at least {memory_to_string(self.activation_mem, postfix='B')} activation memory for micro batch size 1."
) )
#TODO: FIX THIS
stage = self.user_config.get(ZERO_OPTIMIZATION, stage = self.user_config.get(ZERO_OPTIMIZATION,
{}).get(ZERO_OPTIMIZATION_STAGE, {}).get(ZERO_OPTIMIZATION_STAGE,
"all") "all")
stage = "all"
user_zero_stages = [stage] if not isinstance(stage, list) else stage user_zero_stages = [stage] if not isinstance(stage, list) else stage
logger.info(f"User-defined zero stages are {stage}.") logger.info(f"User-defined zero stages are {stage}.")
...@@ -431,9 +464,9 @@ class Autotuner: ...@@ -431,9 +464,9 @@ class Autotuner:
metric_val = 0 metric_val = 0
required_gpu_mem = self.get_instantiation_memory_required_per_gpu( required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_DISABLED) + self.activation_mem ZeroStageEnum.disabled) + self.activation_mem
if self.gpu_mem > required_gpu_mem: if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_DISABLED in user_zero_stages: if "all" in user_zero_stages or ZeroStageEnum.disabled in user_zero_stages:
logger.info( logger.info(
f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space" f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space"
) )
...@@ -443,15 +476,17 @@ class Autotuner: ...@@ -443,15 +476,17 @@ class Autotuner:
mbs = next_mbs mbs = next_mbs
max_mbs = next_max_mbs max_mbs = next_max_mbs
metric_val = next_metric_val metric_val = next_metric_val
if has_mlflow:
mlflow.log_metric(f"z0{self.metric()}", next_metric_val)
else: else:
logger.info( logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_DISABLED} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" f"The model is not runable with ZERO stage {ZeroStageEnum.disabled} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
) )
required_gpu_mem = self.get_instantiation_memory_required_per_gpu( required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_OPTIMIZER_STATES) + self.activation_mem ZeroStageEnum.optimizer_states) + self.activation_mem
if self.gpu_mem > required_gpu_mem: if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_OPTIMIZER_STATES in user_zero_stages: if "all" in user_zero_stages or ZeroStageEnum.optimizer_states in user_zero_stages:
logger.info( logger.info(
f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space" f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space"
) )
...@@ -461,15 +496,17 @@ class Autotuner: ...@@ -461,15 +496,17 @@ class Autotuner:
mbs = next_mbs mbs = next_mbs
max_mbs = next_max_mbs max_mbs = next_max_mbs
metric_val = next_metric_val metric_val = next_metric_val
if has_mlflow:
mlflow.log_metric(f"z1{self.metric()}", next_metric_val)
else: else:
logger.info( logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_OPTIMIZER_STATES} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" f"The model is not runable with ZERO stage {ZeroStageEnum.optimizer_states} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
) )
required_gpu_mem = self.get_instantiation_memory_required_per_gpu( required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_GRADIENTS) + self.activation_mem ZeroStageEnum.gradients) + self.activation_mem
if self.gpu_mem > required_gpu_mem: if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_GRADIENTS in user_zero_stages: if "all" in user_zero_stages or ZeroStageEnum.gradients in user_zero_stages:
logger.info( logger.info(
f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space" f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space"
) )
...@@ -479,25 +516,31 @@ class Autotuner: ...@@ -479,25 +516,31 @@ class Autotuner:
mbs = next_mbs mbs = next_mbs
max_mbs = next_max_mbs max_mbs = next_max_mbs
metric_val = next_metric_val metric_val = next_metric_val
if has_mlflow:
mlflow.log_metric(f"z2{self.metric()}", next_metric_val)
else: else:
logger.info( logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_GRADIENTS} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" f"The model is not runable with ZERO stage {ZeroStageEnum.gradients} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
) )
required_gpu_mem = self.get_instantiation_memory_required_per_gpu( required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_WEIGHTS) + self.activation_mem ZeroStageEnum.weights) + self.activation_mem
if self.gpu_mem > required_gpu_mem: if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_WEIGHTS in user_zero_stages: if "all" in user_zero_stages or ZeroStageEnum.weights in user_zero_stages:
logger.info( logger.info(
f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space" f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space"
) )
_, _, _ = self.tune_space( _, _, next_metric_val = self.tune_space(
DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val) DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val)
if has_mlflow:
mlflow.log_metric(f"z3{self.metric()}", next_metric_val)
else: else:
logger.info( logger.info(
f"The model has {self.get_model_num_params()} parameters and requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory per GPU with DeepSpeed Zero stage {ZERO_OPTIMIZATION_WEIGHTS} optimization. Memory per GPU in system is {memory_to_string(self.gpu_mem)}. No tuning is performed." f"The model has {self.get_model_num_params()} parameters and requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory per GPU with DeepSpeed Zero stage {ZeroStageEnum.weights} optimization. Memory per GPU in system is {memory_to_string(self.gpu_mem)}. No tuning is performed."
) )
return return
if has_mlflow:
mlflow.end_run()
def tune_space(self, def tune_space(self,
tuning_space, tuning_space,
...@@ -505,7 +548,7 @@ class Autotuner: ...@@ -505,7 +548,7 @@ class Autotuner:
prev_best_mbs=0, prev_best_mbs=0,
prev_best_metric_val=0): prev_best_metric_val=0):
config_zero = tuning_space.get(ZERO_OPTIMIZATION, {}) config_zero = tuning_space.get(ZERO_OPTIMIZATION, {})
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT) stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None)
tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage) tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage)
tuning_micro_batch_sizes = [] tuning_micro_batch_sizes = []
max_train_batch_size_per_gpu = 0 max_train_batch_size_per_gpu = 0
...@@ -785,11 +828,12 @@ class Autotuner: ...@@ -785,11 +828,12 @@ class Autotuner:
self.rm.schedule_experiments(exp_paths) self.rm.schedule_experiments(exp_paths)
self.rm.run() self.rm.run()
for exp_id, (exp, err) in self.rm.finished_experiments.items(): for exp_id, (exp, err) in self.rm.finished_experiments.items():
if exp: if exp:
metric_file = exp[DS_CONFIG][AUTOTUNING][AUTOTUNING_METRIC_PATH] metric_file = exp[DS_CONFIG][AUTOTUNING][AUTOTUNING_METRIC_PATH]
if os.path.exists(metric_file): if os.path.exists(metric_file):
with open(metric_file, 'r') as f: with open(metric_file, 'r') as f:
results = hjson.load(f) results = hjson.load(f)
metric_val = results[self.metric()] metric_val = results[self.metric()]
...@@ -797,11 +841,19 @@ class Autotuner: ...@@ -797,11 +841,19 @@ class Autotuner:
if max_micro_batch_size == exp[DS_CONFIG][ if max_micro_batch_size == exp[DS_CONFIG][
TRAIN_MICRO_BATCH_SIZE_PER_GPU]: TRAIN_MICRO_BATCH_SIZE_PER_GPU]:
max_micro_batch_size_metric_val = metric_val max_micro_batch_size_metric_val = metric_val
if has_mlflow:
os.environ.pop('MLFLOW_RUN_ID')
mlflow.start_run(nested=True, run_name=exp['name'])
for metric in results:
mlflow.log_metric(metric, results[metric])
mlflow.end_run()
os.environ['MLFLOW_RUN_ID'] = self.mlflow_parent_id
else: else:
self.update_records(tuning_space_name, exp, 0, 1) self.update_records(tuning_space_name, exp, 0, 1)
else: else:
mbs = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] mbs = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]
logger.info(f"micro batch size = {mbs} was not run successfully") logger.info(f"micro batch size = {mbs} was not run successfully")
self.rm.clear() self.rm.clear()
if tuning_micro_batch_sizes_overwritten: if tuning_micro_batch_sizes_overwritten:
...@@ -831,7 +883,18 @@ class Autotuner: ...@@ -831,7 +883,18 @@ class Autotuner:
self.exp_num_gpus * self.exp_num_nodes // self.mp_size() self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs) exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs)
exp, metric_val = self.run_ds_config(ds_config, exp_name) exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val: if metric_val:
with open(metric_file, 'r') as f:
results = hjson.load(f)
metric_val = results[self.metric()]
if has_mlflow:
os.environ.pop('MLFLOW_RUN_ID')
mlflow.start_run(nested=True, run_name=exp_name)
for metric in results:
mlflow.log_metric(metric, results[metric])
mlflow.end_run()
os.environ['MLFLOW_RUN_ID'] = self.mlflow_parent_id
self.update_records(tuning_space_name, exp, metric_val, 1) self.update_records(tuning_space_name, exp, metric_val, 1)
if metric_val > prev_best_metric_val * (1 + METRIC_PERCENT_DIFF_CONST): if metric_val > prev_best_metric_val * (1 + METRIC_PERCENT_DIFF_CONST):
prev_best_metric_val = metric_val prev_best_metric_val = metric_val
...@@ -843,7 +906,6 @@ class Autotuner: ...@@ -843,7 +906,6 @@ class Autotuner:
break break
if prev_best_mbs != max_micro_batch_size: if prev_best_mbs != max_micro_batch_size:
tuning_micro_batch_sizes[-1] = prev_best_mbs tuning_micro_batch_sizes[-1] = prev_best_mbs
return tuning_micro_batch_sizes return tuning_micro_batch_sizes
def get_min_max_micro_batch_size(self, def get_min_max_micro_batch_size(self,
...@@ -961,11 +1023,10 @@ class Autotuner: ...@@ -961,11 +1023,10 @@ class Autotuner:
low = min_micro_batch_size low = min_micro_batch_size
high = max_micro_batch_size high = max_micro_batch_size
while low < high: # binary search until low is the smallest micro batch size that OOMs.
while low <= high:
mid = int((low + high) // 2) mid = int((low + high) // 2)
logger.debug(f"trying mbs = {mid}, low = {low}, high = {high}") logger.debug(f"trying mbs = {mid}, low = {low}, high = {high}")
if mid == low:
break
if mid not in used_micro_batch_sizes: if mid not in used_micro_batch_sizes:
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mid ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mid
ds_config[TRAIN_BATCH_SIZE] = mid * gas * \ ds_config[TRAIN_BATCH_SIZE] = mid * gas * \
...@@ -973,7 +1034,7 @@ class Autotuner: ...@@ -973,7 +1034,7 @@ class Autotuner:
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mid) exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mid)
exp, metric_val = self.run_ds_config(ds_config, exp_name) exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val: if metric_val:
low = mid low = mid + 1
self.update_records(tuning_space_name, exp, metric_val, 1) self.update_records(tuning_space_name, exp, metric_val, 1)
used_micro_batch_sizes.append(mid) used_micro_batch_sizes.append(mid)
if prev_metric_val and ((metric_val - prev_metric_val) / if prev_metric_val and ((metric_val - prev_metric_val) /
...@@ -985,8 +1046,8 @@ class Autotuner: ...@@ -985,8 +1046,8 @@ class Autotuner:
self.update_records(tuning_space_name, exp, 0, 1) self.update_records(tuning_space_name, exp, 0, 1)
high = mid - 1 high = mid - 1
else: else:
low = mid low = mid + 1
max_micro_batch_size = low max_micro_batch_size = low - 1
logger.info( logger.info(
f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}." f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}."
...@@ -1084,26 +1145,18 @@ class Autotuner: ...@@ -1084,26 +1145,18 @@ class Autotuner:
json.dump(exp_config, fd) json.dump(exp_config, fd)
fd.flush() fd.flush()
os.fsync(fd) os.fsync(fd)
self.rm.schedule_experiments([exp_path]) self.rm.schedule_experiments([exp_path])
self.rm.run() self.rm.run()
exp, metric_val = self.rm.parse_results(self.metric()) exp, metric_val = self.rm.parse_results(self.metric())
self.rm.clear() self.rm.clear()
return exp, metric_val return exp, metric_val
def run_after_tuning(self): def write_optimal_config(self):
""" Launches the training with the optmimal DeepSpeed configuration found through the autotuning process.
"ds_config_optimal.json" describing the optmimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir.
"""
best_space_records = self.get_best_space_records() best_space_records = self.get_best_space_records()
if GLOBAL_TUNING_SPACE not in best_space_records: if GLOBAL_TUNING_SPACE not in best_space_records:
return return
best_exp, best_metric_val, _ = best_space_records[GLOBAL_TUNING_SPACE] best_exp, best_metric_val, _ = best_space_records[GLOBAL_TUNING_SPACE]
if best_exp: if best_exp:
logger.info(
"Start training with the optmimal DeepSpeed configuration found through the tuning process"
)
exp_dir = best_exp["result_dir"] exp_dir = best_exp["result_dir"]
cmd = None cmd = None
with open(os.path.join(exp_dir, "cmd.txt"), "r") as f: with open(os.path.join(exp_dir, "cmd.txt"), "r") as f:
...@@ -1115,18 +1168,27 @@ class Autotuner: ...@@ -1115,18 +1168,27 @@ class Autotuner:
ds_config_path = os.path.join(self.results_dir, "ds_config_optimal.json") ds_config_path = os.path.join(self.results_dir, "ds_config_optimal.json")
json.dump(ds_config, open(ds_config_path, "w")) json.dump(ds_config, open(ds_config_path, "w"))
idx = cmd.index(os.path.join(exp_dir, "ds_config.json"))
cmd[idx] = ds_config_path
cmd_path = os.path.join(self.results_dir, "cmd_optimal.txt") cmd_path = os.path.join(self.results_dir, "cmd_optimal.txt")
with open(cmd_path, "w") as fd: with open(cmd_path, "w") as fd:
fd.write(" ".join(cmd)) fd.write(" ".join(cmd))
fd.write("\n") fd.write("\n")
fd.flush() fd.flush()
self.optimal_cmd = cmd
self.optmal_ds_config = ds_config
logger.info(
f"Wrote the optimal DeepSpeed configuration found by autotuning to {ds_config_path}, and the corresponding DeepSpeed command to {cmd_path}"
)
result = subprocess.Popen(cmd) def run_after_tuning(self):
""" Launches the training with the optimal DeepSpeed configuration found through the autotuning process.
"ds_config_optimal.json" describing the optmimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir.
"""
if self.optimal_cmd:
result = subprocess.Popen(self.optimal_cmd)
result.wait() result.wait()
logger.info( logger.info(
f"Done running with the optimal DeepSpeed configuration found by autotuning: {ds_config_path}" f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}"
) )
else:
logger.info(f"No optimal DeepSpeed configuration found by autotuning.")
'''Copyright The Microsoft DeepSpeed Team'''
""" """
Copyright (c) Microsoft Corporation Copyright (c) Microsoft Corporation
Licensed under the MIT license. Licensed under the MIT license.
...@@ -41,11 +42,11 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigObject): ...@@ -41,11 +42,11 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigObject):
self.results_dir = get_scalar_param(autotuning_dict, self.results_dir = get_scalar_param(autotuning_dict,
AUTOTUNING_RESULTS_DIR, AUTOTUNING_RESULTS_DIR,
AUTOTUNING_RESULTS_DIR_DEFAULT) AUTOTUNING_RESULTS_DIR_DEFAULT)
assert self.results_dir, "results_dir cannot be empty"
self.exps_dir = get_scalar_param(autotuning_dict, self.exps_dir = get_scalar_param(autotuning_dict,
AUTOTUNING_EXPS_DIR, AUTOTUNING_EXPS_DIR,
AUTOTUNING_EXPS_DIR_DEFAULT) AUTOTUNING_EXPS_DIR_DEFAULT)
assert self.exps_dir, "exps_dir cannot be empty"
self.overwrite = get_scalar_param(autotuning_dict, self.overwrite = get_scalar_param(autotuning_dict,
AUTOTUNING_OVERWRITE, AUTOTUNING_OVERWRITE,
AUTOTUNING_OVERWRITE_DEFAULT) AUTOTUNING_OVERWRITE_DEFAULT)
......
'''Copyright The Microsoft DeepSpeed Team'''
""" """
Copyright (c) Microsoft Corporation Copyright (c) Microsoft Corporation
Licensed under the MIT license. Licensed under the MIT license.
...@@ -22,9 +23,6 @@ DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__f ...@@ -22,9 +23,6 @@ DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__f
"config_templates", "config_templates",
"template_zero3.json") "template_zero3.json")
DEFAULT_EXPRS_DIR = os.path.join(os.getcwd(), "autotuning_exps")
DEFAULT_RESULTS_DIR = os.path.join(os.getcwd(), "autotuning_results")
METRIC_PERCENT_DIFF_CONST = 0.05 METRIC_PERCENT_DIFF_CONST = 0.05
DS_CONFIG = "ds_config" DS_CONFIG = "ds_config"
BUFSIZE = 1 # line buffer size for writing files BUFSIZE = 1 # line buffer size for writing files
...@@ -54,10 +52,10 @@ AUTOTUNING_FAST = "fast" ...@@ -54,10 +52,10 @@ AUTOTUNING_FAST = "fast"
AUTOTUNING_FAST_DEFAULT = True AUTOTUNING_FAST_DEFAULT = True
AUTOTUNING_RESULTS_DIR = "results_dir" AUTOTUNING_RESULTS_DIR = "results_dir"
AUTOTUNING_RESULTS_DIR_DEFAULT = None AUTOTUNING_RESULTS_DIR_DEFAULT = "autotuning_results"
AUTOTUNING_EXPS_DIR = "exps_dir" AUTOTUNING_EXPS_DIR = "exps_dir"
AUTOTUNING_EXPS_DIR_DEFAULT = None AUTOTUNING_EXPS_DIR_DEFAULT = "autotuning_exps"
AUTOTUNING_OVERWRITE = "overwrite" AUTOTUNING_OVERWRITE = "overwrite"
AUTOTUNING_OVERWRITE_DEFAULT = True AUTOTUNING_OVERWRITE_DEFAULT = True
......
'''Copyright The Microsoft DeepSpeed Team'''
import copy import copy
from re import I
from numpy import BUFSIZE from numpy import BUFSIZE
from deepspeed.env_report import SUCCESS
from enum import Flag
import json import json
import os
import subprocess import subprocess
import sys import sys
import threading import threading
import time import time
from pathlib import Path import base64
from typing import List
import os
import hjson import hjson
from tqdm import tqdm from tqdm import tqdm
from ..utils import logger from ..utils import logger
from .constants import *
from .constants import AUTOTUNING, AUTOTUNING_METRIC_PATH from .constants import AUTOTUNING, AUTOTUNING_METRIC_PATH
from .utils import get_val_by_key, search_error, was_interruptted from .utils import get_val_by_key, search_error, was_interruptted
""" """
...@@ -25,9 +22,7 @@ thread-0: loop over experiment queue dispatching experiments if they become avai ...@@ -25,9 +22,7 @@ thread-0: loop over experiment queue dispatching experiments if they become avai
thread-N: start each experiment in its own thread thread-N: start each experiment in its own thread
""" """
import torch.distributed as dist from deepspeed import comm as dist
from datetime import datetime
TIMEOUT = 5 TIMEOUT = 5
...@@ -188,7 +183,6 @@ class ResourceManager: ...@@ -188,7 +183,6 @@ class ResourceManager:
logger.debug(f'Put exp_id = {exp["exp_id"]} back into the queue') logger.debug(f'Put exp_id = {exp["exp_id"]} back into the queue')
self.experiment_check(pbar) self.experiment_check(pbar)
else: else:
desc = "" desc = ""
for reservation in reservations: for reservation in reservations:
reservation.slots.sort() reservation.slots.sort()
...@@ -344,19 +338,27 @@ def run_experiment(exp: dict, reservations, user_script, user_args): ...@@ -344,19 +338,27 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
exp["job_id"] = get_job_id() exp["job_id"] = get_job_id()
exp_dir = exp["result_dir"] exp_dir = exp["result_dir"]
os.makedirs(exp_dir, exist_ok=True) os.makedirs(exp_dir, exist_ok=True)
ds_config_path = os.path.join(exp_dir, "ds_config.json")
exp["ds_config_path"] = os.path.join(exp_dir, "ds_config.json") exp["ds_config_path"] = ds_config_path
ds_config = copy.deepcopy(exp["ds_config"]) ds_config = copy.deepcopy(exp["ds_config"])
ds_config_json = json.dumps(ds_config).encode('utf-8')
exp["ds_config_base64"] = base64.urlsafe_b64encode(ds_config_json).decode('utf-8')
with open(exp["ds_config_path"], "w", buffering=BUFSIZE) as fd: with open(exp["ds_config_path"], "w", buffering=BUFSIZE) as fd:
json.dump(ds_config, fd) json.dump(ds_config, fd)
fd.flush() fd.flush()
os.fsync(fd) os.fsync(fd)
path = exp["ds_config_path"]
logger.info(f"Scheduler wrote ds_config to {path}, {os.path.abspath(path)}")
with open(os.path.join(exp_dir, "exp.json"), "w", buffering=BUFSIZE) as fd: with open(os.path.join(exp_dir, "exp.json"), "w", buffering=BUFSIZE) as fd:
json.dump(exp, fd) json.dump(exp, fd)
fd.flush() fd.flush()
os.fsync(fd) os.fsync(fd)
path = os.path.join(exp_dir, "exp.json")
logger.info(f"Scheduler wrote exp to {path}, {os.path.abspath(path)}")
# remove "--deepspeed_config ds_config.json" from user_args # remove "--deepspeed_config ds_config.json" from user_args
if user_args: if user_args:
...@@ -365,9 +367,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args): ...@@ -365,9 +367,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
# "--deepspeed_config" is omitted in HF # "--deepspeed_config" is omitted in HF
elif "--deepspeed" in user_args: elif "--deepspeed" in user_args:
idx = user_args.index("--deepspeed") idx = user_args.index("--deepspeed")
assert idx < len(user_args) and ".json" in user_args[idx + assert idx < len(user_args), "there is no ds_config file specified after --deepspeed_config or --deepspeed"
1], "there is no ds_config file specified after --deepspeed_config or --deepspeed" # user_args[idx + 1] = exp["ds_config_path"]
user_args[idx + 1] = exp["ds_config_path"] # pass base64 serialized ds_config to launcher
user_args[idx + 1] = exp["ds_config_base64"]
exp["user_script"] = user_script exp["user_script"] = user_script
exp["user_args"] = user_args exp["user_args"] = user_args
...@@ -382,7 +385,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args): ...@@ -382,7 +385,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
fd.flush() fd.flush()
os.fsync(fd) os.fsync(fd)
logger.info(f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}") logger.info(
f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}, and ds_config = {os.path.abspath(ds_config_path)}"
)
with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open( with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open(
os.path.join(exp_dir, "stderr.log"), "wb" os.path.join(exp_dir, "stderr.log"), "wb"
...@@ -396,7 +401,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args): ...@@ -396,7 +401,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
clean_up(exp, reservations) clean_up(exp, reservations)
logger.info(f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}") logger.info(
f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}"
)
PDSH_MAX_FAN_OUT = 1024 PDSH_MAX_FAN_OUT = 1024
......
'''Copyright The Microsoft DeepSpeed Team'''
from .index_based_tuner import RandomTuner, GridSearchTuner from .index_based_tuner import RandomTuner, GridSearchTuner
# from .ga_tuner import GATuner # from .ga_tuner import GATuner
from .model_based_tuner import ModelBasedTuner from .model_based_tuner import ModelBasedTuner
import atexit '''Copyright The Microsoft DeepSpeed Team'''
import sys import sys
from deepspeed.autotuning.constants import * from deepspeed.autotuning.constants import *
from deepspeed.autotuning.utils import write_experiments from deepspeed.autotuning.utils import write_experiments
from deepspeed.utils import logger from deepspeed.utils import logger
import json
class BaseTuner: class BaseTuner:
def __init__(self, exps, resource_manager, metric): def __init__(self, exps, resource_manager, metric):
......
import numpy as np '''Copyright The Microsoft DeepSpeed Team'''
from .utils import * from .utils import *
......
import random '''Copyright The Microsoft DeepSpeed Team'''
from deepspeed.utils import logger import random
from .base_tuner import BaseTuner from .base_tuner import BaseTuner
......
'''Copyright The Microsoft DeepSpeed Team'''
import hjson import hjson
import numpy as np
from deepspeed.utils import logger
from ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH, AUTOTUNING_METRIC_DEFAULT from ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH
from .base_tuner import BaseTuner from .base_tuner import BaseTuner
from .cost_model import XGBoostCostModel from .cost_model import XGBoostCostModel
from .utils import * from .utils import *
......
'''Copyright The Microsoft DeepSpeed Team'''
import numpy as np import numpy as np
import itertools import itertools
from ..utils import * from ..utils import *
......
'''Copyright The Microsoft DeepSpeed Team'''
import re import re
import collections.abc import collections.abc
import os import os
import json import json
from deepspeed.runtime.constants import GRADIENT_ACCUMULATION_STEPS, TRAIN_MICRO_BATCH_SIZE_PER_GPU from deepspeed.runtime.constants import GRADIENT_ACCUMULATION_STEPS, TRAIN_MICRO_BATCH_SIZE_PER_GPU
import hjson
import sys
import itertools import itertools
import copy import copy
...@@ -35,23 +35,11 @@ def was_interruptted(filename): ...@@ -35,23 +35,11 @@ def was_interruptted(filename):
return False return False
def was_interruptted(filename):
if not os.path.exists(filename):
return "stderr.log does not exist"
with open(filename) as f:
for line in f:
s = "KeyboardInterrupt"
idx = line.find(s)
if idx != -1:
return True
return False
def find_replace_str(value, replace_dict): def find_replace_str(value, replace_dict):
if not isinstance(value, str): if not isinstance(value, str):
return str(value) return str(value)
matches = re.findall("\$[A-Za-z0-9_]+", value) matches = re.findall(r"\$[A-Za-z0-9_]+", value)
for var in matches: for var in matches:
var_key = var.replace("$", "").lower() var_key = var.replace("$", "").lower()
if var_key == "nvme_path": if var_key == "nvme_path":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment