Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once

/*
Platform feature detection and core includes for the DeepSpeed transformer
kernels.

HALF_PRECISION_AVAILABLE is defined when the target supports __half math:
always on ROCm (HIP), and on CUDA only when compiling for compute
capability >= 7.0 (Volta+). Fix: the previous form `#define X = 1` made the
macro expand to the token sequence `= 1`; it happened to work because the
macro is only ever tested with #ifdef, but it would break any future
`#if HALF_PRECISION_AVAILABLE` use. Define it as a plain 1 instead.
*/
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE 1
#endif
#include <cooperative_groups.h>
#endif

#include "ds_kernel_utils.h"

#include <cuda.h>
#include <cuda_fp16.h>
......@@ -19,12 +15,17 @@
// Launch-configuration constants shared by the transformer kernels.
#define MAX_WARP_NUM 32   // 1024 threads per block / 32 lanes = at most 32 warps
#define WARP_SIZE 32      // lanes per warp
#define MAX_THREADS 1024  // CUDA upper bound on threads per block
// NOTE(review): SM count is hard-coded (80 matches V100/A100-class parts).
// Consider querying the device (cudaDeviceGetAttribute) at runtime instead.
#define SMs 80
#define MAX_REGISTERS 256
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
T* alibi,
float layer_scale,
bool triangular,
bool recompute,
bool local_attention,
......@@ -33,7 +34,9 @@ void launch_attn_softmax_v2(T* vals,
int heads,
int num_seq,
int sequence_length,
float scale,
int offset,
int mask_stride,
int mp_size,
cudaStream_t stream);
// Fused bias add with gelu activation
......@@ -43,6 +46,23 @@ void launch_bias_gelu(T* input,
int intermediate_size,
int batch_size,
cudaStream_t stream);
// Fused bias-add + GEGLU activation: reads `activation` ([rows, elems_per_row]
// presumed) plus `bias`, writes the gated result to `output`.
// NOTE(review): exact bias length / gating split are defined by the kernel —
// confirm against the .cu implementation.
template <typename T>
void launch_fused_bias_geglu(T* output,
                             const T* activation,
                             const T* bias,
                             int rows,
                             int elems_per_row,
                             cudaStream_t stream);

// Fused bias add with relu activation
// Operates on `input` of [batch_size, intermediate_size]; non-const `input`
// suggests the result is written in place — TODO confirm in the kernel.
template <typename T>
void launch_bias_relu(T* input,
                      const T* bias,
                      int intermediate_size,
                      int batch_size,
                      cudaStream_t stream);

// Elementwise bias add over [batch_size, hidden_size] (in place, per the
// non-const `input` — confirm in the kernel).
template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream);
......@@ -55,32 +75,44 @@ void launch_bias_residual(T* input,
int batch,
int hidden_dim,
int mp_size,
bool preln,
cudaStream_t stream);
template <typename T>
void launch_layer_norm(T* out,
T* vals,
void launch_fused_ln(T* output,
const T* vals,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
int rows,
int elems_per_row,
cudaStream_t stream);
template <typename T>
void launch_residual_layer_norm(T* norm,
T* res_add,
T* vals,
T* residual,
void launch_fused_residual_ln(T* output,
const T* vals,
const T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
bool mlp_after_attn,
int rows,
int elems_per_row,
cudaStream_t stream);
// Fused residual-add + bias + LayerNorm producing two outputs: the normalized
// result (`norm_output`) and — per the function name — the pre-LayerNorm
// residual sum (`res_output`). Presumed shapes: [rows, elems_per_row]
// activations with gamma/beta of length elems_per_row; TODO confirm against
// the kernel definition.
template <typename T>
void launch_fused_residual_ln_store_pre_ln_res(T* norm_output,
                                               T* res_output,
                                               const T* vals,
                                               const T* residual,
                                               const T* bias,
                                               const T* gamma,
                                               const T* beta,
                                               float epsilon,
                                               int rows,
                                               int elems_per_row,
                                               cudaStream_t stream);
template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
......@@ -92,6 +124,14 @@ void launch_dequantize(T* output,
cudaStream_t stream);
// Dequantizes int8 `input` into `output` using per-group float scales
// `qscale`. NOTE(review): how `groups` partitions the
// [output_size, hidden_dim] data is defined by the kernel — confirm there.
template <typename T>
void launch_dequantize(T* output,
                       const int8_t* input,
                       const float* qscale,
                       unsigned output_size,
                       unsigned hidden_dim,
                       unsigned groups,
                       cudaStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input,
T* output,
T* attn,
......@@ -113,7 +153,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query,
unsigned batch,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream);
cudaStream_t stream,
int max_out_tokens);
template <typename T>
void launch_moe_res_matmul(T* residual,
......@@ -122,3 +163,60 @@ void launch_moe_res_matmul(T* residual,
int seq_len,
int hidden_dim,
cudaStream_t stream);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
// Swaps dims 1 and 2 of a 4-D tensor (e.g. [batch, heads, seq, head_dim]
// layouts). `trans_count` presumably selects how many fused tensors (q/k/v)
// are transformed in one launch — TODO confirm in the kernel.
template <typename T>
void launch_transform4d_0213(T* out,
                             const T* in,
                             int batch_size,
                             int heads,
                             int seq_length,
                             int hidden_dim,
                             cudaStream_t stream,
                             int trans_count);

// Fused bias add + [0,1,2,3] -> [0,2,1,3] transform for attention inputs.
// `rotary_dim` / `rotate_half` / `rotate_every_two` mirror the rotary
// positional-embedding options used elsewhere in this header.
// NOTE(review): the exact roles of vals/vals1/vals2 and of seq_length1 /
// seq_offset are defined by the kernel — confirm against the implementation.
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
                                    T* vals,
                                    T* vals1,
                                    const T* vals2,
                                    const T* bias,
                                    int batch_size,
                                    int seq_length,
                                    unsigned seq_offset,
                                    int seq_length1,
                                    int hidden_dim,
                                    int heads,
                                    int rotary_dim,
                                    bool rotate_half,
                                    bool rotate_every_two,
                                    cudaStream_t stream,
                                    int trans_count,
                                    int max_out_tokens);

// Pads each head vector from head_size to padded_head_size (presumably
// zero-filled — TODO confirm in the kernel).
template <typename T>
void pad_data(T* padded_output,
              T* output,
              int bsz,
              int head_size,
              int padded_head_size,
              cudaStream_t stream);

// Pads both the sequence dimension (seq_len -> padded_seq_len) and the head
// dimension (head_size -> padded_head_size).
template <typename T>
void pad_head_seq(T* padded_output,
                  T* output,
                  int bsz,
                  int seq_len,
                  int padded_seq_len,
                  int head_size,
                  int padded_head_size,
                  cudaStream_t stream);

// Combined padding + [0,1,2,3] -> [0,2,1,3] transform.
template <typename T>
void launch_pad_add_transform_0213(T* output,
                                   const T* vals,
                                   int batch_size,
                                   int hidden_dim,
                                   int seq_length,
                                   int padded_seq_len,
                                   int heads,
                                   int padded_head_size,
                                   cudaStream_t stream);
// !!! This is a file automatically generated by hipify!!!
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once

/*
Platform feature detection and core includes (hipified variant).

HALF_PRECISION_AVAILABLE is defined when the target supports __half math:
always on ROCm (HIP), and on CUDA only when compiling for compute
capability >= 7.0 (Volta+). Fix: the previous form `#define X = 1` made the
macro expand to the token sequence `= 1`; it happened to work because the
macro is only ever tested with #ifdef, but it would break any future
`#if HALF_PRECISION_AVAILABLE` use. Define it as a plain 1 instead.
*/
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE 1
#endif
#include <cooperative_groups.h>
#endif

#include "ds_kernel_utils_hip.h"

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
......@@ -20,12 +16,17 @@
// Launch-configuration constants shared by the transformer kernels (hip build).
#define MAX_WARP_NUM 32   // 1024 threads per block / 32 lanes = at most 32 warps
#define WARP_SIZE 32      // lanes per warp; NOTE(review): AMD wavefronts are 64 on many parts — confirm intent
#define MAX_THREADS 1024  // upper bound on threads per block
// NOTE(review): SM/CU count is hard-coded at 80; consider querying the device
// (hipDeviceGetAttribute) at runtime instead.
#define SMs 80
#define MAX_REGISTERS 256
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
T* alibi,
float layer_scale,
bool triangular,
bool recompute,
bool local_attention,
......@@ -34,7 +35,9 @@ void launch_attn_softmax_v2(T* vals,
int heads,
int num_seq,
int sequence_length,
float scale,
int offset,
int mask_stride,
int mp_size,
hipStream_t stream);
// Fused bias add with gelu activation
......@@ -44,6 +47,23 @@ void launch_bias_gelu(T* input,
int intermediate_size,
int batch_size,
hipStream_t stream);
// Fused bias-add + GEGLU activation: reads `activation` ([rows, elems_per_row]
// presumed) plus `bias`, writes the gated result to `output`.
// NOTE(review): exact bias length / gating split are defined by the kernel —
// confirm against the implementation.
template <typename T>
void launch_fused_bias_geglu(T* output,
                             const T* activation,
                             const T* bias,
                             int rows,
                             int elems_per_row,
                             hipStream_t stream);

// Fused bias add with relu activation
// Operates on `input` of [batch_size, intermediate_size]; non-const `input`
// suggests the result is written in place — TODO confirm in the kernel.
template <typename T>
void launch_bias_relu(T* input,
                      const T* bias,
                      int intermediate_size,
                      int batch_size,
                      hipStream_t stream);

// Elementwise bias add over [batch_size, hidden_size] (in place, per the
// non-const `input` — confirm in the kernel).
template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, hipStream_t stream);
......@@ -56,32 +76,44 @@ void launch_bias_residual(T* input,
int batch,
int hidden_dim,
int mp_size,
bool preln,
hipStream_t stream);
template <typename T>
void launch_layer_norm(T* out,
T* vals,
void launch_fused_ln(T* output,
const T* vals,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
int rows,
int elems_per_row,
hipStream_t stream);
template <typename T>
void launch_residual_layer_norm(T* norm,
T* res_add,
T* vals,
T* residual,
void launch_fused_residual_ln(T* output,
const T* vals,
const T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
bool mlp_after_attn,
int rows,
int elems_per_row,
hipStream_t stream);
// Fused residual-add + bias + LayerNorm producing two outputs: the normalized
// result (`norm_output`) and — per the function name — the pre-LayerNorm
// residual sum (`res_output`). Presumed shapes: [rows, elems_per_row]
// activations with gamma/beta of length elems_per_row; TODO confirm against
// the kernel definition.
template <typename T>
void launch_fused_residual_ln_store_pre_ln_res(T* norm_output,
                                               T* res_output,
                                               const T* vals,
                                               const T* residual,
                                               const T* bias,
                                               const T* gamma,
                                               const T* beta,
                                               float epsilon,
                                               int rows,
                                               int elems_per_row,
                                               hipStream_t stream);
template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
......@@ -93,6 +125,14 @@ void launch_dequantize(T* output,
hipStream_t stream);
// Dequantizes int8 `input` into `output` using per-group float scales
// `qscale`. NOTE(review): how `groups` partitions the
// [output_size, hidden_dim] data is defined by the kernel — confirm there.
template <typename T>
void launch_dequantize(T* output,
                       const int8_t* input,
                       const float* qscale,
                       unsigned output_size,
                       unsigned hidden_dim,
                       unsigned groups,
                       hipStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input,
T* output,
T* attn,
......@@ -114,7 +154,8 @@ void launch_apply_rotary_pos_emb(T* mixed_query,
unsigned batch,
bool rotate_half,
bool rotate_every_two,
hipStream_t stream);
hipStream_t stream,
int max_out_tokens);
template <typename T>
void launch_moe_res_matmul(T* residual,
......@@ -123,3 +164,60 @@ void launch_moe_res_matmul(T* residual,
int seq_len,
int hidden_dim,
hipStream_t stream);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
// Swaps dims 1 and 2 of a 4-D tensor (e.g. [batch, heads, seq, head_dim]
// layouts). `trans_count` presumably selects how many fused tensors (q/k/v)
// are transformed in one launch — TODO confirm in the kernel.
template <typename T>
void launch_transform4d_0213(T* out,
                             const T* in,
                             int batch_size,
                             int heads,
                             int seq_length,
                             int hidden_dim,
                             hipStream_t stream,
                             int trans_count);

// Fused bias add + [0,1,2,3] -> [0,2,1,3] transform for attention inputs.
// `rotary_dim` / `rotate_half` / `rotate_every_two` mirror the rotary
// positional-embedding options used elsewhere in this header.
// NOTE(review): the exact roles of vals/vals1/vals2 and of seq_length1 /
// seq_offset are defined by the kernel — confirm against the implementation.
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
                                    T* vals,
                                    T* vals1,
                                    const T* vals2,
                                    const T* bias,
                                    int batch_size,
                                    int seq_length,
                                    unsigned seq_offset,
                                    int seq_length1,
                                    int hidden_dim,
                                    int heads,
                                    int rotary_dim,
                                    bool rotate_half,
                                    bool rotate_every_two,
                                    hipStream_t stream,
                                    int trans_count,
                                    int max_out_tokens);

// Pads each head vector from head_size to padded_head_size (presumably
// zero-filled — TODO confirm in the kernel).
template <typename T>
void pad_data(T* padded_output,
              T* output,
              int bsz,
              int head_size,
              int padded_head_size,
              hipStream_t stream);

// Pads both the sequence dimension (seq_len -> padded_seq_len) and the head
// dimension (head_size -> padded_head_size).
template <typename T>
void pad_head_seq(T* padded_output,
                  T* output,
                  int bsz,
                  int seq_len,
                  int padded_seq_len,
                  int head_size,
                  int padded_head_size,
                  hipStream_t stream);

// Combined padding + [0,1,2,3] -> [0,2,1,3] transform.
template <typename T>
void launch_pad_add_transform_0213(T* output,
                                   const T* vals,
                                   int batch_size,
                                   int hidden_dim,
                                   int seq_length,
                                   int padded_seq_len,
                                   int heads,
                                   int padded_head_size,
                                   hipStream_t stream);
/*
Copyright The Microsoft DeepSpeed Team
*/
#include "custom_cuda_layers.h"
namespace cg = cooperative_groups;
......@@ -862,6 +866,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
bool invertible,
int row_stride)
{
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
......@@ -985,6 +990,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
inp_grad_h[high_index] = temp;
}
#endif
}
template <>
......@@ -1172,6 +1178,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
__half* inp_grad,
int row_stride)
{
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
......@@ -1290,6 +1297,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp;
}
#endif
}
template <>
......@@ -1601,6 +1609,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
bool invertible,
int row_stride)
{
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
......@@ -1727,6 +1736,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
inp_grad_h[high_index] = temp + out_grad_h2[high_index];
}
#endif
}
template <>
......@@ -1922,6 +1932,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
__half* inp_grad,
int row_stride)
{
#ifdef HALF_PRECISION_AVAILABLE
int iteration_stride = blockDim.x;
int iterations = row_stride / iteration_stride;
......@@ -2044,6 +2055,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
__half2 temp = __float22half2_rn(vals_arr_f[iterations]);
inp_grad_h[high_index] = temp + out_grad_h2[high_index];
}
#endif
}
template <>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#include <math.h>
#include "custom_cuda_layers.h"
#include "general_kernels.h"
......@@ -536,6 +540,102 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
}
}
// Softmax backward for arbitrary sequence length (__half specialization).
// Per softmax row: grad = output * (grad - sum(grad * output)).
// Layout: one warp (WARP_SIZE lanes along threadIdx.x) cooperates on one row;
// blockDim.y rows are handled per block. Data is accessed as float4 (8 halves
// per load), so `softmax_length` is in float4 units — the launcher in this
// file divides the scalar length by 8 for __half before launch.
__global__ void softmax_backward_kernel_arbitrary_length(__half* grad /* input & output*/,
                                                         const __half* output,
                                                         int softmax_length)
{
    // Row index: blockIdx.x blocks of blockDim.y rows, threadIdx.y selects within.
    int batch_idx = blockIdx.x * blockDim.y + threadIdx.y;
    // NOTE(review): `offset` folds in threadIdx.x AND the loop index below also
    // starts at threadIdx.x with a WARP_SIZE stride, so lane t touches float4
    // slot base + 2t, 2t + 32, ... — lanes t and t+16 alias, and odd slots are
    // never visited. One of the two threadIdx.x terms looks unintended; verify
    // against the reference implementation before relying on this kernel.
    int offset = batch_idx * softmax_length + threadIdx.x;
    const float4* output_cast = reinterpret_cast<const float4*>(output);
    float4* grad_cast = reinterpret_cast<float4*>(grad);
    grad_cast += offset;
    output_cast += offset;

    // Phase 1: this lane's partial sum of grad * output over its strided slots.
    float sum = 0.0;
    int curr_idx = threadIdx.x;
    while (curr_idx < softmax_length) {
        float4 out_reg = output_cast[curr_idx];
        float4 grad_reg = grad_cast[curr_idx];
        __half2* out_h = reinterpret_cast<__half2*>(&out_reg);
        __half2* grad_h = reinterpret_cast<__half2*>(&grad_reg);
#pragma unroll
        // Elementwise grad *= out on the 4 half2 pairs held in the float4.
        for (int m = 0; m < 4; m++) grad_h[m] *= out_h[m];
        // Accumulate all 8 products in fp32 for accuracy.
        sum += ((float)grad_h[0].x + (float)grad_h[0].y + (float)grad_h[1].x + (float)grad_h[1].y) +
               ((float)grad_h[2].x + (float)grad_h[2].y + (float)grad_h[3].x + (float)grad_h[3].y);
        curr_idx += WARP_SIZE;
    }

    // Butterfly (xor-shuffle) reduction: every lane ends with the warp-wide sum.
    cg::thread_block b = cg::this_thread_block();
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#pragma unroll
    for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);

    // Phase 2: rewrite grad in place as out * (grad - sum), half by half.
    curr_idx = threadIdx.x;
    while (curr_idx < softmax_length) {
        float4 out_reg = output_cast[curr_idx];
        float4 grad_reg = grad_cast[curr_idx];
        __half* grad_h = reinterpret_cast<__half*>(&grad_reg);
        __half* out_h = reinterpret_cast<__half*>(&out_reg);
#pragma unroll
        for (int m = 0; m < 8; m++) grad_h[m] = (float)out_h[m] * ((float)grad_h[m] - sum);
        grad_cast[curr_idx] = grad_reg;
        curr_idx += WARP_SIZE;
    }
}
// Softmax backward for arbitrary sequence length (float specialization).
// Per softmax row: grad = output * (grad - sum(grad * output)).
// Layout: one warp (WARP_SIZE lanes along threadIdx.x) cooperates on one row;
// blockDim.y rows per block. Data is accessed as float4 (4 floats per load),
// so `softmax_length` is in float4 units — the launcher in this file divides
// the scalar length by 4 for float before launch.
__global__ void softmax_backward_kernel_arbitrary_length(float* grad /* input & output*/,
                                                         const float* output,
                                                         int softmax_length)
{
    // Row index: blockIdx.x blocks of blockDim.y rows, threadIdx.y selects within.
    int batch_idx = blockIdx.x * blockDim.y + threadIdx.y;
    // NOTE(review): same indexing concern as the __half specialization —
    // `offset` already includes threadIdx.x while `curr_idx` below also starts
    // at threadIdx.x with a WARP_SIZE stride, so lanes alias (t and t+16) and
    // odd float4 slots are skipped. Verify against the reference implementation.
    int offset = batch_idx * softmax_length + threadIdx.x;
    const float4* output_cast = reinterpret_cast<const float4*>(output);
    float4* grad_cast = reinterpret_cast<float4*>(grad);
    grad_cast += offset;
    output_cast += offset;

    // Phase 1: this lane's partial sum of grad * output over its strided slots.
    float sum = 0.0;
    int curr_idx = threadIdx.x;
    while (curr_idx < softmax_length) {
        float4 out_reg = output_cast[curr_idx];
        float4 grad_reg = grad_cast[curr_idx];
        grad_reg.x *= out_reg.x;
        grad_reg.y *= out_reg.y;
        grad_reg.z *= out_reg.z;
        grad_reg.w *= out_reg.w;
        sum += (grad_reg.x + grad_reg.y + grad_reg.z + grad_reg.w);
        curr_idx += WARP_SIZE;
    }

    // Butterfly (xor-shuffle) reduction: every lane ends with the warp-wide sum.
    cg::thread_block b = cg::this_thread_block();
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
#pragma unroll
    for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);

    // Phase 2: rewrite grad in place as out * (grad - sum).
    curr_idx = threadIdx.x;
    while (curr_idx < softmax_length) {
        float4 out_reg = output_cast[curr_idx];
        float4 grad_reg = grad_cast[curr_idx];
        grad_reg.x = out_reg.x * (grad_reg.x - sum);
        grad_reg.y = out_reg.y * (grad_reg.y - sum);
        grad_reg.z = out_reg.z * (grad_reg.z - sum);
        grad_reg.w = out_reg.w * (grad_reg.w - sum);
        grad_cast[curr_idx] = grad_reg;
        curr_idx += WARP_SIZE;
    }
}
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp,
......@@ -575,10 +675,15 @@ void launch_attn_softmax_backward_v2(T* out_grad,
else if (seq_length <= 2048)
softmax_backward_kernel_v2<T, 64>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 4096)
softmax_backward_kernel_v2<T, 128>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else if (seq_length <= 8192)
softmax_backward_kernel_v2<T, 256>
<<<grid_dim, block_dim, 0, stream>>>(out_grad, soft_inp, seq_length);
else
throw std::runtime_error(
std::string("Special sequence length found in softmax backward, seq_length: ") +
std::to_string(seq_length));
softmax_backward_kernel_arbitrary_length<<<grid_dim, block_dim, 0, stream>>>(
out_grad, soft_inp, seq_length / (4 << ((sizeof(T) & 2) >> 1)));
}
template void launch_attn_softmax_backward_v2<__half>(__half* out_grad,
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#include "custom_cuda_layers.h"
#define rows_trans 16
......
......@@ -4,6 +4,7 @@ Copyright 2020 The Microsoft DeepSpeed Team
import sys
import types
import json
from typing import Optional, Union
import torch
from torch.optim import Optimizer
......@@ -17,17 +18,18 @@ from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpe
from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER
from .runtime.pipe.engine import PipelineEngine
from .inference.engine import InferenceEngine
from .inference.config import DeepSpeedInferenceConfig
from .runtime.lr_schedules import add_tuning_arguments
from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError
from .runtime.activation_checkpointing import checkpointing
from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
from .module_inject import replace_transformer_layer, revert_transformer_layer
from .utils import log_dist
from .utils.distributed import init_distributed
from .utils import log_dist, OnDevice
from .comm.comm import init_distributed
from .runtime import zero
from .runtime import DeepSpeedOptimizer, ZeROOptimizer
from .pipe import PipelineModule
......@@ -82,7 +84,7 @@ def initialize(args=None,
mpu: Optional: A model parallelism unit object that implements
get_{model,data}_parallel_{rank,group,world_size}()
dist_init_required: Optional: None will auto-initialize torch.distributed if needed,
dist_init_required: Optional: None will auto-initialize torch distributed if needed,
otherwise the user can force it to be initialized or not via boolean.
collate_fn: Optional: Merges a list of samples to form a
......@@ -113,6 +115,10 @@ def initialize(args=None,
__git_hash__,
__git_branch__),
ranks=[0])
# Disable zero.Init context if it's currently enabled
zero.partition_parameters.shutdown_init_context()
assert model is not None, "deepspeed.initialize requires a model"
if not isinstance(model, PipelineModule):
......@@ -217,61 +223,57 @@ def add_config_arguments(parser):
return parser
def init_inference(model,
triangular_masking=True,
mp_size=1,
training_mp_size=1,
mpu=None,
ep_group=None,
expert_mp_group=None,
checkpoint=None,
dtype=None,
injection_policy=None,
replace_method='auto',
quantization_setting=None,
replace_with_kernel_inject=False,
return_tuple=True,
ep_size=1,
moe=False,
moe_experts=1,
moe_type='standard',
args=None):
def default_inference_config():
    """Build the default DeepSpeed inference configuration as a dictionary.

    Instantiates :class:`DeepSpeedInferenceConfig` with no overrides, so every
    field carries its declared default, then serializes it via ``.dict()``.

    Returns:
        dict: the default DeepSpeed inference configuration values.
    """
    defaults = DeepSpeedInferenceConfig()
    return defaults.dict()
def init_inference(model, config=None, **kwargs):
"""Initialize the DeepSpeed InferenceEngine.
Arguments:
model: Required: nn.module class before apply any wrappers
Description: all four cases are valid and supported in DS init_inference() API.
triangular_masking: Required: this shows the type of masking for attention scores in transformer layer
note that the masking is application specific.
# Case 1: user provides no config and no kwargs. Default config will be used.
mp_size: Optional: Desired model parallel size, default is 1 meaning no
model parallelism.
.. code-block:: python
training_mp_size: Optional: if loading a checkpoint this is the mp size that it was trained with,
it may be different than what the mp size that you want to use during inference.
generator.model = deepspeed.init_inference(generator.model)
string = generator("DeepSpeed is")
print(string)
mpu: Optional: A model parallelism unit object that implements
get_{model,data}_parallel_{rank,group,world_size}()
# Case 2: user provides a config and no kwargs. User supplied config will be used.
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model, config=config)
string = generator("DeepSpeed is")
print(string)
checkpoint: Optional: Path to deepspeed compatible checkpoint or path to
JSON with load policy.
# Case 3: user provides no config and uses keyword arguments (kwargs) only.
dtype: Optional: Desired model data type, will convert model to this type.
Supported target types: torch.half, torch.int8, torch.float
.. code-block:: python
injection_policy: Optional: Dictionary mapping a client nn.Module to its corresponding
injection policy. e.g., {BertLayer : deepspeed.inference.HFBertLayerPolicy}
generator.model = deepspeed.init_inference(generator.model,
mp_size=world_size,
dtype=torch.half,
replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
replace_method: Optional: If 'auto' DeepSpeed will automatically try and replace
model modules with its optimized versions. If an injection_policy is set this will
override the automatic replacement behavior.
# Case 4: user provides config and keyword arguments (kwargs). Both config and kwargs are merged and kwargs take precedence.
quantization_setting: Optional: Quantization settings used for quantizing your model using the MoQ.
The setting can be one element or a tuple. If one value is passed in, we consider it as the number
of groups used in quantization. A tuple is passed in if we want to mention that there is extra-grouping
for the MLP part of a Transformer layer (e.g. (True, 8) shows we quantize the model using 8 groups for
all the network except the MLP part that we use 8 extra grouping).
replace_with_kernel_inject: If set we inject kernel as we initialize the inference-engine
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
print(string)
Arguments:
model: Required: original nn.module object without any wrappers
config: Optional: instead of arguments, you can pass in a DS inference config dict or path to JSON file
Returns:
A deepspeed.InferenceEngine wrapped model.
......@@ -282,24 +284,30 @@ def init_inference(model,
__git_branch__),
ranks=[0])
engine = InferenceEngine(model,
triangular_masking,
mp_size,
training_mp_size,
ep_size,
mpu,
ep_group,
expert_mp_group,
checkpoint,
dtype,
injection_policy,
return_tuple,
replace_method,
quantization_setting,
replace_with_kernel_inject,
moe,
moe_experts,
moe_type,
args)
# Load config_dict from config first
if config is None:
config = {}
if isinstance(config, str):
with open(config, "r") as f:
config_dict = json.load(f)
elif isinstance(config, dict):
config_dict = config
else:
raise ValueError(
f"'config' argument expected string or dictionary, got {type(config)}")
# Update with values from kwargs, ensuring no conflicting overlap between config and kwargs
overlap_keys = set(config_dict.keys()).intersection(kwargs.keys())
# If there is overlap, error out if values are different
for key in overlap_keys:
if config_dict[key] != kwargs[key]:
raise ValueError(
f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}"
)
config_dict.update(kwargs)
ds_inference_config = DeepSpeedInferenceConfig(**config_dict)
engine = InferenceEngine(model, config=ds_inference_config)
return engine
../accelerator/
\ No newline at end of file
File mode changed from 100644 to 100755
'''Copyright The Microsoft DeepSpeed Team'''
from .autotuner import Autotuner
import copy
import json
import os
from random import sample
'''Copyright The Microsoft DeepSpeed Team'''
import shutil
import subprocess
import hjson
import torch
import time
import datetime
import math
import hjson
from ..runtime.config_utils import dict_raise_error_on_duplicate_keys
from ..runtime.constants import *
from ..runtime.zero.constants import *
from ..runtime.zero.config import ZERO_OPTIMIZATION, ZeroStageEnum
from ..utils import logger
from .config import DeepSpeedAutotuningConfig
from .constants import *
from .scheduler import ResourceManager, run_experiment
from .scheduler import ResourceManager
from .tuner import GridSearchTuner, RandomTuner, ModelBasedTuner
from .utils import *
from deepspeed.accelerator import get_accelerator
try:
from tabulate import tabulate
except ImportError:
tabulate = None
try:
import mlflow
has_mlflow = True
except Exception as e:
has_mlflow = False
ZERO_OPTIMIZATION_STAGE = "stage"
OFFLOAD_OPTIMIZER = "offload_optimizer"
OFFLOAD_PARAM = "offload_param"
ZERO_OPTIMIZATION_STAGE_DEFAULT = ZeroStageEnum.disabled
class Autotuner:
"""The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods.
......@@ -42,22 +52,37 @@ class Autotuner:
assert self.user_config is not None, "DeepSpeed configuration is not provided"
self.autotuning_config = DeepSpeedAutotuningConfig(self.user_config)
if self.user_config[AUTOTUNING]:
if AUTOTUNING_EXPS_DIR in self.user_config[AUTOTUNING].keys():
del self.user_config[AUTOTUNING][AUTOTUNING_EXPS_DIR]
if AUTOTUNING_RESULTS_DIR in self.user_config[AUTOTUNING].keys():
del self.user_config[AUTOTUNING][AUTOTUNING_RESULTS_DIR]
self.exps_dir = DEFAULT_EXPRS_DIR
if self.autotuning_config.exps_dir and self.autotuning_config.exps_dir != "":
self.exps_dir = self.autotuning_config.exps_dir
if self.autotuning_config.overwrite and os.path.exists(self.exps_dir):
shutil.rmtree(self.exps_dir, ignore_errors=True)
if not os.path.exists(self.exps_dir):
try:
os.makedirs(self.exps_dir, exist_ok=True)
logger.info(f"Created autotuning experiments directory: {self.exps_dir}")
except:
logger.error(
f"Failed to create {self.exps_dir}, please check `exps_dir` in the autotuning config file is accessible by all the nodes in the job."
)
exit(-1)
self.results_dir = DEFAULT_RESULTS_DIR
if self.autotuning_config.results_dir and self.autotuning_config.results_dir != "":
self.results_dir = self.autotuning_config.results_dir
if self.autotuning_config.overwrite and os.path.exists(self.results_dir):
shutil.rmtree(self.results_dir, ignore_errors=True)
if not os.path.exists(self.results_dir):
try:
os.makedirs(self.results_dir, exist_ok=True)
logger.info(f"Created autotuning resutls directory: {self.exps_dir}")
except:
logger.error(
f"Failed to create {self.results_dir}, please check `results_dir` in the autotuning config file is accessible by all the nodes in the job."
)
exit(-1)
# set the active resource for the autotuner resource manager
self.rm = self._get_resource_manager(active_resources)
......@@ -70,6 +95,10 @@ class Autotuner:
self.rm.nodes), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any"
self.records = {}
self.optimal_cmd = None
self.optmal_ds_config = None
self.mlflow_parent_id = None
def print_tuning_results(self):
"""Print the autotuning results in tabular format.
......@@ -252,7 +281,7 @@ class Autotuner:
return False
def get_gpu_memory_info(self):
return torch.cuda.get_device_properties(0).total_memory
return get_accelerator().total_memory()
def get_activation_memory_per_gpu(self):
if self.model_info and "activation_mem_per_gpu" in self.model_info:
......@@ -266,18 +295,18 @@ class Autotuner:
if not num_params:
return 0
# assume the model uses Adam optimizer
# ZERO_OPTIMIZATION_DISABLED:
# ZeroStageEnum.disabled:
params_mem = num_params * (2 if fp16_enabled else 4)
gradients_mem = num_params * (2 if fp16_enabled else 4)
optimizer_mem = num_params * (16 if fp16_enabled else 8)
if zero_stage >= ZERO_OPTIMIZATION_OPTIMIZER_STATES:
if zero_stage >= ZeroStageEnum.optimizer_states:
optimizer_mem = optimizer_mem / total_gpus
if zero_stage >= ZERO_OPTIMIZATION_GRADIENTS:
if zero_stage >= ZeroStageEnum.gradients:
gradients_mem = gradients_mem / total_gpus
if zero_stage >= ZERO_OPTIMIZATION_WEIGHTS:
if zero_stage >= ZeroStageEnum.weights:
params_mem = params_mem / total_gpus
mem_per_gpu = (params_mem + gradients_mem + optimizer_mem) / self.mp_size()
......@@ -308,7 +337,7 @@ class Autotuner:
# each zero stage uses a different template configuration file
config_zero = tuning_space.get(ZERO_OPTIMIZATION, {})
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None)
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT)
template_config = {}
if stage == 0:
template_path = DEFAULT_TEMPLATE_PATH_ZERO_0
......@@ -331,12 +360,11 @@ class Autotuner:
model_info = self.model_info
if model_info and "hidden_size" in model_info:
hs = model_info["hidden_size"]
template_config[ZERO_OPTIMIZATION]['reduce_bucket_size'] = hs * hs
template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE] = hs * hs
'stage3_prefetch_bucket_size'] = 0.9 * hs * hs
template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE] = 0.9 * hs * hs
template_config[ZERO_OPTIMIZATION][
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD] = 10 * hs
'stage3_param_persistence_threshold'] = 10 * hs
prefix = "z3_"
else:
return exps
......@@ -355,11 +383,11 @@ class Autotuner:
logger.debug(f"tuning_keys = {tuning_keys}")
logger.debug(f"before prunning total configs = {len(all_configs)}")
logger.debug(f"before pruning total configs = {len(all_configs)}")
pruned_list = prune_configs(all_configs)
logger.debug(f"after prunning total configs = {len(pruned_list)}")
logger.debug(f"after pruning total configs = {len(pruned_list)}")
for config in pruned_list:
exp_config = copy.deepcopy(template_config)
......@@ -375,7 +403,6 @@ class Autotuner:
if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[
ZERO_OPTIMIZATION]:
del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM]
# set gradient accumulation steps according to max_train_batch_size_per_gpu
mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]
gas = max_train_batch_size_per_gpu // mbs
......@@ -396,6 +423,10 @@ class Autotuner:
def tune(self):
""" Tunes Zero stages, micro batch size per GPU, and other Zero configurations. Performance metrics of different tuning spaces are recorded in self.records.
"""
if has_mlflow:
self.mlflow_parent_id = os.environ['MLFLOW_RUN_ID']
mlflow.start_run(run_id=self.mlflow_parent_id)
self.start_time = time.time()
if self.fast_enabled():
logger.info(f"Fast mode is enabled. Tuning micro batch size only.")
......@@ -420,9 +451,11 @@ class Autotuner:
f"The model requires at least {memory_to_string(self.activation_mem, postfix='B')} activation memory for micro batch size 1."
)
#TODO: FIX THIS
stage = self.user_config.get(ZERO_OPTIMIZATION,
{}).get(ZERO_OPTIMIZATION_STAGE,
"all")
stage = "all"
user_zero_stages = [stage] if not isinstance(stage, list) else stage
logger.info(f"User-defined zero stages are {stage}.")
......@@ -431,9 +464,9 @@ class Autotuner:
metric_val = 0
required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_DISABLED) + self.activation_mem
ZeroStageEnum.disabled) + self.activation_mem
if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_DISABLED in user_zero_stages:
if "all" in user_zero_stages or ZeroStageEnum.disabled in user_zero_stages:
logger.info(
f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space"
)
......@@ -443,15 +476,17 @@ class Autotuner:
mbs = next_mbs
max_mbs = next_max_mbs
metric_val = next_metric_val
if has_mlflow:
mlflow.log_metric(f"z0{self.metric()}", next_metric_val)
else:
logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_DISABLED} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
f"The model is not runable with ZERO stage {ZeroStageEnum.disabled} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
)
required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_OPTIMIZER_STATES) + self.activation_mem
ZeroStageEnum.optimizer_states) + self.activation_mem
if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_OPTIMIZER_STATES in user_zero_stages:
if "all" in user_zero_stages or ZeroStageEnum.optimizer_states in user_zero_stages:
logger.info(
f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space"
)
......@@ -461,15 +496,17 @@ class Autotuner:
mbs = next_mbs
max_mbs = next_max_mbs
metric_val = next_metric_val
if has_mlflow:
mlflow.log_metric(f"z1{self.metric()}", next_metric_val)
else:
logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_OPTIMIZER_STATES} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
f"The model is not runable with ZERO stage {ZeroStageEnum.optimizer_states} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
)
required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_GRADIENTS) + self.activation_mem
ZeroStageEnum.gradients) + self.activation_mem
if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_GRADIENTS in user_zero_stages:
if "all" in user_zero_stages or ZeroStageEnum.gradients in user_zero_stages:
logger.info(
f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space"
)
......@@ -479,25 +516,31 @@ class Autotuner:
mbs = next_mbs
max_mbs = next_max_mbs
metric_val = next_metric_val
if has_mlflow:
mlflow.log_metric(f"z2{self.metric()}", next_metric_val)
else:
logger.info(
f"The model is not runable with ZERO stage {ZERO_OPTIMIZATION_GRADIENTS} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
f"The model is not runable with ZERO stage {ZeroStageEnum.gradients} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)"
)
required_gpu_mem = self.get_instantiation_memory_required_per_gpu(
ZERO_OPTIMIZATION_WEIGHTS) + self.activation_mem
ZeroStageEnum.weights) + self.activation_mem
if self.gpu_mem > required_gpu_mem:
if "all" in user_zero_stages or ZERO_OPTIMIZATION_WEIGHTS in user_zero_stages:
if "all" in user_zero_stages or ZeroStageEnum.weights in user_zero_stages:
logger.info(
f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space"
)
_, _, _ = self.tune_space(
_, _, next_metric_val = self.tune_space(
DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val)
if has_mlflow:
mlflow.log_metric(f"z3{self.metric()}", next_metric_val)
else:
logger.info(
f"The model has {self.get_model_num_params()} parameters and requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory per GPU with DeepSpeed Zero stage {ZERO_OPTIMIZATION_WEIGHTS} optimization. Memory per GPU in system is {memory_to_string(self.gpu_mem)}. No tuning is performed."
f"The model has {self.get_model_num_params()} parameters and requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory per GPU with DeepSpeed Zero stage {ZeroStageEnum.weights} optimization. Memory per GPU in system is {memory_to_string(self.gpu_mem)}. No tuning is performed."
)
return
if has_mlflow:
mlflow.end_run()
def tune_space(self,
tuning_space,
......@@ -505,7 +548,7 @@ class Autotuner:
prev_best_mbs=0,
prev_best_metric_val=0):
config_zero = tuning_space.get(ZERO_OPTIMIZATION, {})
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT)
stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None)
tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage)
tuning_micro_batch_sizes = []
max_train_batch_size_per_gpu = 0
......@@ -785,11 +828,12 @@ class Autotuner:
self.rm.schedule_experiments(exp_paths)
self.rm.run()
for exp_id, (exp, err) in self.rm.finished_experiments.items():
if exp:
metric_file = exp[DS_CONFIG][AUTOTUNING][AUTOTUNING_METRIC_PATH]
if os.path.exists(metric_file):
with open(metric_file, 'r') as f:
results = hjson.load(f)
metric_val = results[self.metric()]
......@@ -797,11 +841,19 @@ class Autotuner:
if max_micro_batch_size == exp[DS_CONFIG][
TRAIN_MICRO_BATCH_SIZE_PER_GPU]:
max_micro_batch_size_metric_val = metric_val
if has_mlflow:
os.environ.pop('MLFLOW_RUN_ID')
mlflow.start_run(nested=True, run_name=exp['name'])
for metric in results:
mlflow.log_metric(metric, results[metric])
mlflow.end_run()
os.environ['MLFLOW_RUN_ID'] = self.mlflow_parent_id
else:
self.update_records(tuning_space_name, exp, 0, 1)
else:
mbs = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]
logger.info(f"micro batch size = {mbs} was not run successfully")
self.rm.clear()
if tuning_micro_batch_sizes_overwritten:
......@@ -831,7 +883,18 @@ class Autotuner:
self.exp_num_gpus * self.exp_num_nodes // self.mp_size()
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mbs)
exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val:
with open(metric_file, 'r') as f:
results = hjson.load(f)
metric_val = results[self.metric()]
if has_mlflow:
os.environ.pop('MLFLOW_RUN_ID')
mlflow.start_run(nested=True, run_name=exp_name)
for metric in results:
mlflow.log_metric(metric, results[metric])
mlflow.end_run()
os.environ['MLFLOW_RUN_ID'] = self.mlflow_parent_id
self.update_records(tuning_space_name, exp, metric_val, 1)
if metric_val > prev_best_metric_val * (1 + METRIC_PERCENT_DIFF_CONST):
prev_best_metric_val = metric_val
......@@ -843,7 +906,6 @@ class Autotuner:
break
if prev_best_mbs != max_micro_batch_size:
tuning_micro_batch_sizes[-1] = prev_best_mbs
return tuning_micro_batch_sizes
def get_min_max_micro_batch_size(self,
......@@ -961,11 +1023,10 @@ class Autotuner:
low = min_micro_batch_size
high = max_micro_batch_size
while low < high:
# binary search until low is the smallest micro batch size that OOMs.
while low <= high:
mid = int((low + high) // 2)
logger.debug(f"trying mbs = {mid}, low = {low}, high = {high}")
if mid == low:
break
if mid not in used_micro_batch_sizes:
ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mid
ds_config[TRAIN_BATCH_SIZE] = mid * gas * \
......@@ -973,7 +1034,7 @@ class Autotuner:
exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(mid)
exp, metric_val = self.run_ds_config(ds_config, exp_name)
if metric_val:
low = mid
low = mid + 1
self.update_records(tuning_space_name, exp, metric_val, 1)
used_micro_batch_sizes.append(mid)
if prev_metric_val and ((metric_val - prev_metric_val) /
......@@ -985,8 +1046,8 @@ class Autotuner:
self.update_records(tuning_space_name, exp, 0, 1)
high = mid - 1
else:
low = mid
max_micro_batch_size = low
low = mid + 1
max_micro_batch_size = low - 1
logger.info(
f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}."
......@@ -1084,26 +1145,18 @@ class Autotuner:
json.dump(exp_config, fd)
fd.flush()
os.fsync(fd)
self.rm.schedule_experiments([exp_path])
self.rm.run()
exp, metric_val = self.rm.parse_results(self.metric())
self.rm.clear()
return exp, metric_val
def run_after_tuning(self):
""" Launches the training with the optmimal DeepSpeed configuration found through the autotuning process.
"ds_config_optimal.json" describing the optmimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir.
"""
def write_optimal_config(self):
best_space_records = self.get_best_space_records()
if GLOBAL_TUNING_SPACE not in best_space_records:
return
best_exp, best_metric_val, _ = best_space_records[GLOBAL_TUNING_SPACE]
if best_exp:
logger.info(
"Start training with the optmimal DeepSpeed configuration found through the tuning process"
)
exp_dir = best_exp["result_dir"]
cmd = None
with open(os.path.join(exp_dir, "cmd.txt"), "r") as f:
......@@ -1115,18 +1168,27 @@ class Autotuner:
ds_config_path = os.path.join(self.results_dir, "ds_config_optimal.json")
json.dump(ds_config, open(ds_config_path, "w"))
idx = cmd.index(os.path.join(exp_dir, "ds_config.json"))
cmd[idx] = ds_config_path
cmd_path = os.path.join(self.results_dir, "cmd_optimal.txt")
with open(cmd_path, "w") as fd:
fd.write(" ".join(cmd))
fd.write("\n")
fd.flush()
self.optimal_cmd = cmd
self.optmal_ds_config = ds_config
logger.info(
f"Wrote the optimal DeepSpeed configuration found by autotuning to {ds_config_path}, and the corresponding DeepSpeed command to {cmd_path}"
)
result = subprocess.Popen(cmd)
def run_after_tuning(self):
    """Launch training with the optimal DeepSpeed configuration found by autotuning.

    Relies on ``self.optimal_cmd`` having been prepared earlier (the optimal
    config was written to "ds_config_optimal.json" and the launch command to
    "cmd_optimal.txt" under ``self.results_dir``).  Blocks until the launched
    training process exits.  If no optimal configuration was found, only logs
    a message and returns.
    """
    if self.optimal_cmd:
        # Fire off the best command found and wait for it to finish.
        result = subprocess.Popen(self.optimal_cmd)
        result.wait()
        logger.info(
            f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}"
        )
    else:
        # No f-string needed: the message has no placeholders.
        logger.info("No optimal DeepSpeed configuration found by autotuning.")
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
......@@ -41,11 +42,11 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigObject):
self.results_dir = get_scalar_param(autotuning_dict,
AUTOTUNING_RESULTS_DIR,
AUTOTUNING_RESULTS_DIR_DEFAULT)
assert self.results_dir, "results_dir cannot be empty"
self.exps_dir = get_scalar_param(autotuning_dict,
AUTOTUNING_EXPS_DIR,
AUTOTUNING_EXPS_DIR_DEFAULT)
assert self.exps_dir, "exps_dir cannot be empty"
self.overwrite = get_scalar_param(autotuning_dict,
AUTOTUNING_OVERWRITE,
AUTOTUNING_OVERWRITE_DEFAULT)
......
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
......@@ -22,9 +23,6 @@ DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__f
"config_templates",
"template_zero3.json")
DEFAULT_EXPRS_DIR = os.path.join(os.getcwd(), "autotuning_exps")
DEFAULT_RESULTS_DIR = os.path.join(os.getcwd(), "autotuning_results")
METRIC_PERCENT_DIFF_CONST = 0.05
DS_CONFIG = "ds_config"
BUFSIZE = 1 # line buffer size for writing files
......@@ -54,10 +52,10 @@ AUTOTUNING_FAST = "fast"
AUTOTUNING_FAST_DEFAULT = True
AUTOTUNING_RESULTS_DIR = "results_dir"
AUTOTUNING_RESULTS_DIR_DEFAULT = None
AUTOTUNING_RESULTS_DIR_DEFAULT = "autotuning_results"
AUTOTUNING_EXPS_DIR = "exps_dir"
AUTOTUNING_EXPS_DIR_DEFAULT = None
AUTOTUNING_EXPS_DIR_DEFAULT = "autotuning_exps"
AUTOTUNING_OVERWRITE = "overwrite"
AUTOTUNING_OVERWRITE_DEFAULT = True
......
'''Copyright The Microsoft DeepSpeed Team'''
import copy
from re import I
from numpy import BUFSIZE
from deepspeed.env_report import SUCCESS
from enum import Flag
import json
import os
import subprocess
import sys
import threading
import time
from pathlib import Path
from typing import List
import base64
import os
import hjson
from tqdm import tqdm
from ..utils import logger
from .constants import *
from .constants import AUTOTUNING, AUTOTUNING_METRIC_PATH
from .utils import get_val_by_key, search_error, was_interruptted
"""
......@@ -25,9 +22,7 @@ thread-0: loop over experiment queue dispatching experiments if they become avai
thread-N: start each experiment in its own thread
"""
import torch.distributed as dist
from datetime import datetime
from deepspeed import comm as dist
TIMEOUT = 5
......@@ -188,7 +183,6 @@ class ResourceManager:
logger.debug(f'Put exp_id = {exp["exp_id"]} back into the queue')
self.experiment_check(pbar)
else:
desc = ""
for reservation in reservations:
reservation.slots.sort()
......@@ -344,19 +338,27 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
exp["job_id"] = get_job_id()
exp_dir = exp["result_dir"]
os.makedirs(exp_dir, exist_ok=True)
exp["ds_config_path"] = os.path.join(exp_dir, "ds_config.json")
ds_config_path = os.path.join(exp_dir, "ds_config.json")
exp["ds_config_path"] = ds_config_path
ds_config = copy.deepcopy(exp["ds_config"])
ds_config_json = json.dumps(ds_config).encode('utf-8')
exp["ds_config_base64"] = base64.urlsafe_b64encode(ds_config_json).decode('utf-8')
with open(exp["ds_config_path"], "w", buffering=BUFSIZE) as fd:
json.dump(ds_config, fd)
fd.flush()
os.fsync(fd)
path = exp["ds_config_path"]
logger.info(f"Scheduler wrote ds_config to {path}, {os.path.abspath(path)}")
with open(os.path.join(exp_dir, "exp.json"), "w", buffering=BUFSIZE) as fd:
json.dump(exp, fd)
fd.flush()
os.fsync(fd)
path = os.path.join(exp_dir, "exp.json")
logger.info(f"Scheduler wrote exp to {path}, {os.path.abspath(path)}")
# remove "--deepspeed_config ds_config.json" from user_args
if user_args:
......@@ -365,9 +367,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
# "--deepspeed_config" is omitted in HF
elif "--deepspeed" in user_args:
idx = user_args.index("--deepspeed")
assert idx < len(user_args) and ".json" in user_args[idx +
1], "there is no ds_config file specified after --deepspeed_config or --deepspeed"
user_args[idx + 1] = exp["ds_config_path"]
assert idx < len(user_args), "there is no ds_config file specified after --deepspeed_config or --deepspeed"
# user_args[idx + 1] = exp["ds_config_path"]
# pass base64 serialized ds_config to launcher
user_args[idx + 1] = exp["ds_config_base64"]
exp["user_script"] = user_script
exp["user_args"] = user_args
......@@ -382,7 +385,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
fd.flush()
os.fsync(fd)
logger.info(f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}")
logger.info(
f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}, and ds_config = {os.path.abspath(ds_config_path)}"
)
with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open(
os.path.join(exp_dir, "stderr.log"), "wb"
......@@ -396,7 +401,9 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
clean_up(exp, reservations)
logger.info(f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}")
logger.info(
f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}"
)
PDSH_MAX_FAN_OUT = 1024
......
'''Copyright The Microsoft DeepSpeed Team'''
from .index_based_tuner import RandomTuner, GridSearchTuner
# from .ga_tuner import GATuner
from .model_based_tuner import ModelBasedTuner
import atexit
'''Copyright The Microsoft DeepSpeed Team'''
import sys
from deepspeed.autotuning.constants import *
from deepspeed.autotuning.utils import write_experiments
from deepspeed.utils import logger
import json
class BaseTuner:
def __init__(self, exps, resource_manager, metric):
......
import numpy as np
'''Copyright The Microsoft DeepSpeed Team'''
from .utils import *
......
import random
'''Copyright The Microsoft DeepSpeed Team'''
from deepspeed.utils import logger
import random
from .base_tuner import BaseTuner
......
'''Copyright The Microsoft DeepSpeed Team'''
import hjson
import numpy as np
from deepspeed.utils import logger
from ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH, AUTOTUNING_METRIC_DEFAULT
from ..constants import AUTOTUNING, AUTOTUNING_METRIC_PATH
from .base_tuner import BaseTuner
from .cost_model import XGBoostCostModel
from .utils import *
......
'''Copyright The Microsoft DeepSpeed Team'''
import numpy as np
import itertools
from ..utils import *
......
'''Copyright The Microsoft DeepSpeed Team'''
import re
import collections.abc
import os
import json
from deepspeed.runtime.constants import GRADIENT_ACCUMULATION_STEPS, TRAIN_MICRO_BATCH_SIZE_PER_GPU
import hjson
import sys
import itertools
import copy
......@@ -35,23 +35,11 @@ def was_interruptted(filename):
return False
def was_interruptted(filename):
    """Report whether the log file at ``filename`` records a KeyboardInterrupt.

    Returns True when any line of the file contains "KeyboardInterrupt" and
    False otherwise.  When the file does not exist, returns the (truthy)
    string "stderr.log does not exist" — callers that test the result for
    truthiness therefore treat a missing log as an interruption.
    """
    if not os.path.exists(filename):
        return "stderr.log does not exist"
    marker = "KeyboardInterrupt"
    with open(filename) as log:
        for text_line in log:
            if marker in text_line:
                return True
    return False
def find_replace_str(value, replace_dict):
if not isinstance(value, str):
return str(value)
matches = re.findall("\$[A-Za-z0-9_]+", value)
matches = re.findall(r"\$[A-Za-z0-9_]+", value)
for var in matches:
var_key = var.replace("$", "").lower()
if var_key == "nvme_path":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment