Unverified Commit 41db1c2f authored by Jeff Rasley's avatar Jeff Rasley Committed by GitHub
Browse files
parent 79093d74
This diff is collapsed.
#include "custom_cuda_layers.h"
// Converts fp32 values to fp16, four elements per thread.
//
// Each thread reads one float4 from `input`, converts its x/y/z/w components to
// __half, and writes the four halves to `output` as a single float2-sized store.
//
// input  : source fp32 values, accessed as float4
// output : destination fp16 values, accessed in float2-sized (4 x __half) chunks
// size   : number of 4-element groups to convert (NOT the number of scalars)
//
// NOTE(review): the float4 load / float2 store presumably require `input` to be
// 16-byte aligned and `output` to be 8-byte aligned -- confirm against callers.
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    const int group = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last 4-element group have nothing to do.
    if (group >= size) { return; }

    const float4 src = reinterpret_cast<const float4*>(input)[group];

    // Build the four halves in a float2-sized temporary so they can be written
    // back with one 8-byte store.
    float2 packed;
    __half* halves = reinterpret_cast<__half*>(&packed);
    halves[0] = (__half)src.x;
    halves[1] = (__half)src.y;
    halves[2] = (__half)src.z;
    halves[3] = (__half)src.w;

    reinterpret_cast<float2*>(output)[group] = packed;
}
// Launches param_update_kernel on `stream` to convert `size` fp32 values from
// `input` into fp16 values in `output`.
//
// The kernel handles four elements per thread, so the element count is divided
// by 4 (integer division) before launch: when `size` is not a multiple of 4 the
// trailing 1-3 elements are not converted.
//
// The launch is asynchronous with respect to the host and no launch error is
// checked here.
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
    const int threads_per_block = 512;
    const int group_count = size / 4;

    // Ceil-division of group_count by the block size.
    const int block_count = (group_count - 1) / threads_per_block + 1;

    param_update_kernel<<<dim3(block_count), dim3(threads_per_block), 0, stream>>>(
        input, output, group_count);
}
#pragma once
#include <cpuid.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <x86intrin.h>
#include <cassert>
#include <cstdlib>
#include "context.h"
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
// Evaluates a CUDA runtime call once and terminates the process if it did not
// return cudaSuccess.
//
// On failure the numeric error code, its cudaGetErrorString() description and
// the call site (__FILE__:__LINE__) are written to stderr. assert(0) is kept so
// debug builds still get the usual assertion diagnostics; std::abort() follows
// it because assert() compiles to nothing when NDEBUG is defined, which would
// otherwise let execution continue after a failed call.
//
// The macro expands to a brace-enclosed block (not do/while), so call sites
// written with or without a trailing semicolon keep compiling. As a
// consequence it must not be used as the unbraced body of an `if` that has an
// `else`.
#define CUDA_CHECK(callstr)                                    \
    {                                                          \
        cudaError_t error_code = (callstr);                    \
        if (error_code != cudaSuccess) {                       \
            fprintf(stderr,                                    \
                    "CUDA error %d (%s) at %s:%d\n",           \
                    (int)error_code,                           \
                    cudaGetErrorString(error_code),            \
                    __FILE__,                                  \
                    __LINE__);                                 \
            assert(0);                                         \
            std::abort();                                      \
        }                                                      \
    }
// Number of float elements in one tile (2^30).
#define TILE (1024 * 1024 * 1024)

// Thin aliases over the x86 SIMD intrinsics so the same source can target
// either 512-bit or 256-bit vectors. When neither feature macro is defined,
// none of the SIMD_* macros exist.
//
// NOTE(review): __AVX512__ / __AVX256__ do not look like compiler-predefined
// names; presumably the build system passes them with -D -- confirm.
#if defined(__AVX512__)

// 512-bit vectors: 16 floats per register.
#define SIMD_WIDTH 16
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)

#elif defined(__AVX256__)

// 256-bit vectors: 8 floats per register.
#define SIMD_WIDTH 8
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)

#endif
// Holds the hyperparameters and running beta powers of an Adam optimizer, plus
// two pinned host buffers of TILE floats each.
//
// The Step* member functions are only declared here; their definitions live
// elsewhere.
//
// Ownership: the two buffers are allocated with cudaMallocHost in the
// constructor and released with cudaFreeHost in the destructor. Copying is
// disabled because a copy would share the raw pointers and free them twice.
class Adam_Optimizer {
public:
    // alpha        : stored in _alpha
    // betta1/2     : stored in _betta1 / _betta2; also the per-step factors
    //                applied by IncrementStep()
    // eps          : stored in _eps
    // weight_decay : stored in _weight_decay
    //
    // Allocates both pinned buffers (2 * TILE * sizeof(float) bytes in total).
    // Allocation failures are reported through CUDA_CHECK.
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _buf_index(false)
    {
        // Start from null so the destructor never sees an indeterminate
        // pointer if an allocation below fails.
        _doubled_buffer[0] = nullptr;
        _doubled_buffer[1] = nullptr;

        CUDA_CHECK(cudaMallocHost((void**)&_doubled_buffer[0], TILE * sizeof(float)));
        CUDA_CHECK(cudaMallocHost((void**)&_doubled_buffer[1], TILE * sizeof(float)));
    }

    ~Adam_Optimizer()
    {
        // Only release buffers that were actually allocated.
        if (_doubled_buffer[0] != nullptr) { cudaFreeHost(_doubled_buffer[0]); }
        if (_doubled_buffer[1] != nullptr) { cudaFreeHost(_doubled_buffer[1]); }
    }

    // Non-copyable: the object exclusively owns its pinned buffers.
    Adam_Optimizer(const Adam_Optimizer&) = delete;
    Adam_Optimizer& operator=(const Adam_Optimizer&) = delete;

    // Optimizer step entry points (declared only; defined elsewhere).
    // NOTE(review): the _4 / _8 suffixes presumably denote unrolled variants --
    // confirm against the definitions.
    void Step(float* _params,
              float* grads,
              float* _exp_avg,
              float* _exp_avg_sq,
              size_t param_size,
              __half* dev_param = nullptr);
    void Step_4(float* _params,
                float* grads,
                float* _exp_avg,
                float* _exp_avg_sq,
                size_t param_size,
                __half* dev_param = nullptr);
    void Step_8(float* _params,
                float* grads,
                float* _exp_avg,
                float* _exp_avg_sq,
                size_t _param_size,
                __half* dev_params = nullptr);

    // Multiplies the running powers by one more factor of each beta, so after
    // t calls _betta1_t == betta1^t and _betta2_t == betta2^t.
    inline void IncrementStep()
    {
        _betta1_t *= _betta1;
        _betta2_t *= _betta2;
    }

private:
#if defined(__AVX512__) || defined(__AVX256__)
    // Wrapper around one SIMD register of the width selected at build time.
    union AVX_Data {
#if defined(__AVX512__)
        __m512 data;
#else
        __m256 data;
#endif
        // float data_f[16];
    };
#endif

    float _alpha;
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;

    // Running products betta1^t and betta2^t; both start at 1.0.
    float _betta1_t;
    float _betta2_t;

    // Two pinned host buffers of TILE floats each.
    float* _doubled_buffer[2];
    // NOTE(review): presumably selects which of the two buffers is in use --
    // it is only initialised (to false) in this header.
    bool _buf_index;
};
...@@ -264,3 +264,5 @@ void launch_fuse_transpose_bias_kernel(const T* inp, ...@@ -264,3 +264,5 @@ void launch_fuse_transpose_bias_kernel(const T* inp,
int rows, int rows,
int cols, int cols,
cudaStream_t stream); cudaStream_t stream);
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream);
...@@ -4,16 +4,18 @@ Copyright 2020 The Microsoft DeepSpeed Team ...@@ -4,16 +4,18 @@ Copyright 2020 The Microsoft DeepSpeed Team
import sys import sys
import types import types
from deepspeed.runtime.engine import DeepSpeedEngine from . import ops
from deepspeed.runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER
from deepspeed.runtime.lr_schedules import add_tuning_arguments from .runtime.engine import DeepSpeedEngine
from deepspeed.runtime.config import DeepSpeedConfig from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER, DEEPSPEED_ADAM
from deepspeed.runtime.activation_checkpointing import checkpointing from .runtime.lr_schedules import add_tuning_arguments
from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .runtime.config import DeepSpeedConfig
from deepspeed.utils import logger from .runtime.activation_checkpointing import checkpointing
from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
from .utils import logger
try: try:
from deepspeed.git_version_info import version, git_hash, git_branch from .git_version_info import version, git_hash, git_branch
except ImportError: except ImportError:
version = "0.0.0+unknown" version = "0.0.0+unknown"
git_hash = None git_hash = None
......
from ..git_version_info import installed_ops as __installed_ops__
from . import lamb
from . import transformer
if __installed_ops__['sparse-attn']:
from . import sparse_attention
if __installed_ops__['cpu-adam']:
from . import adam
from .cpu_adam import DeepSpeedCPUAdam
import math
import torch
import importlib
ds_opt_adam = None
class DeepSpeedCPUAdam(torch.optim.Optimizer):
    """Adam optimizer whose update is performed by the native
    ``deepspeed.ops.adam.cpu_adam_op`` extension.

    Every instance takes the next value of the class-level ``optimizer_id``
    counter as its ``opt_id`` and registers a native optimizer under that id
    with ``create_adam``.

    NOTE(review): ``amsgrad`` is stored in the param-group defaults but is
    never passed to the native calls made here; likewise ``lr``, ``betas``,
    ``eps`` and ``weight_decay`` are only handed over once, at construction,
    so later edits to ``param_groups`` are not forwarded by this class --
    confirm whether that is intended.
    """

    # Next id to hand out; shared by all instances.
    optimizer_id = 0

    def __init__(self,
                 model_params,
                 lr=1e-3,
                 betas=(0.9,
                        0.999),
                 eps=1e-8,
                 weight_decay=0,
                 amsgrad=False):
        defaults = {
            'lr': lr,
            'betas': betas,
            'eps': eps,
            'weight_decay': weight_decay,
            'amsgrad': amsgrad,
        }
        super().__init__(model_params, defaults)

        # Claim an id, then advance the shared counter.
        self.opt_id = DeepSpeedCPUAdam.optimizer_id
        DeepSpeedCPUAdam.optimizer_id += 1

        # The extension module is loaded lazily and cached in a module global
        # so that step() can reach it.
        global ds_opt_adam
        ds_opt_adam = importlib.import_module('deepspeed.ops.adam.cpu_adam_op')
        ds_opt_adam.create_adam(self.opt_id, lr, betas[0], betas[1], eps, weight_decay)

    def __setstate__(self, state):
        """Restore state, defaulting a missing ``amsgrad`` entry to False."""
        super().__setstate__(state)
        for param_group in self.param_groups:
            param_group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None, fp16_param_groups=None):
        """Run one optimizer step over every parameter that has a gradient.

        Args:
            closure: optional callable, evaluated with gradients enabled; its
                return value is returned by this method.
            fp16_param_groups: optional nested sequence indexed as
                ``[group_id][param_id]``. When given, the matching entry is
                passed to ``adam_update_copy`` together with the parameter;
                otherwise ``adam_update`` is used.

        Returns:
            The closure's result, or ``None`` when no closure was given.
        """
        if closure is None:
            loss = None
        else:
            with torch.enable_grad():
                loss = closure()

        copy_to_fp16 = fp16_param_groups is not None

        for group_id, group in enumerate(self.param_groups):
            for param_id, p in enumerate(group['params']):
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                # First time this parameter is seen: create its state.
                if not state:
                    print(f'group {group_id} param {param_id} = {p.numel()}')
                    state.update(
                        step=0,
                        # gradient momentums
                        exp_avg=torch.zeros_like(p.data, device='cpu'),
                        # gradient variances
                        exp_avg_sq=torch.zeros_like(p.data, device='cpu'),
                    )

                exp_avg = state['exp_avg']
                exp_avg_sq = state['exp_avg_sq']
                state['step'] += 1

                if copy_to_fp16:
                    half_param = fp16_param_groups[group_id][param_id]
                    ds_opt_adam.adam_update_copy(self.opt_id,
                                                 p.data,
                                                 grad,
                                                 exp_avg,
                                                 exp_avg_sq,
                                                 half_param)
                else:
                    ds_opt_adam.adam_update(self.opt_id,
                                            p.data,
                                            grad,
                                            exp_avg,
                                            exp_avg_sq)
        return loss
import torch
from torch.autograd import Variable
import collections
def async_migrate_to(obj, dev, main_stream=None):
    """Recursively move every tensor inside ``obj`` to CUDA device ``dev``.

    Tensors are moved with ``Tensor.cuda(dev, non_blocking=True)``. Mappings
    are rebuilt as ``dict`` and sequences as ``list`` (so a tuple comes back
    as a list), with each contained value migrated the same way. Strings,
    bytes and any other object are returned unchanged.

    Args:
        obj: a tensor, a mapping/sequence of such values, or any other object.
        dev: target CUDA device, passed straight to ``Tensor.cuda``.
        main_stream: optional stream; when given, every migrated tensor is
            registered on it through ``record_stream``.

    Returns:
        The migrated tensor or rebuilt container, or ``obj`` itself when it
        holds nothing to migrate.
    """
    # The abstract base classes live in collections.abc; the old
    # collections.Mapping / collections.Sequence aliases no longer exist.
    from collections.abc import Mapping, Sequence

    if torch.is_tensor(obj):
        # `async` is a reserved word in current Python; PyTorch's name for
        # this flag is `non_blocking`.
        moved = obj.cuda(dev, non_blocking=True)
        if main_stream is not None:
            moved.data.record_stream(main_stream)
        return moved

    # A str is a Sequence whose items are again str, so it must be treated as
    # a leaf or the recursion below would never terminate.
    if isinstance(obj, (str, bytes)):
        return obj

    if isinstance(obj, Mapping):
        return {k: async_migrate_to(v, dev, main_stream) for k, v in obj.items()}

    if isinstance(obj, Sequence):
        return [async_migrate_to(v, dev, main_stream) for v in obj]

    return obj
def async_copy_to(obj, dev, main_stream=None):
    """Recursively copy every tensor inside ``obj`` into new storage on ``dev``.

    For each tensor a fresh tensor is allocated on ``dev`` with
    ``torch.empty_like`` and filled with ``copy_``, so the result never aliases
    the input. Mappings are rebuilt as ``dict`` and sequences as ``list`` (so a
    tuple comes back as a list). Strings, bytes and any other object are
    returned unchanged.

    Args:
        obj: a tensor, a mapping/sequence of such values, or any other object.
        dev: target device, passed to ``torch.empty_like(..., device=dev)``.
        main_stream: optional stream; when given, every new tensor is
            registered on it through ``record_stream``.

    Returns:
        The copied tensor or rebuilt container, or ``obj`` itself when it
        holds nothing to copy.
    """
    # The abstract base classes live in collections.abc; the old
    # collections.Mapping / collections.Sequence aliases no longer exist.
    from collections.abc import Mapping, Sequence

    if torch.is_tensor(obj):
        target = torch.empty_like(obj, device=dev).copy_(obj)
        if main_stream is not None:
            target.data.record_stream(main_stream)
        return target

    # A str is a Sequence whose items are again str, so it must be treated as
    # a leaf or the recursion below would never terminate.
    if isinstance(obj, (str, bytes)):
        return obj

    if isinstance(obj, Mapping):
        return {k: async_copy_to(v, dev, main_stream) for k, v in obj.items()}

    if isinstance(obj, Sequence):
        return [async_copy_to(v, dev, main_stream) for v in obj]

    # Non-tensor leaves pass through instead of silently becoming None.
    return obj
...@@ -10,14 +10,21 @@ from deepspeed.runtime.constants import * ...@@ -10,14 +10,21 @@ from deepspeed.runtime.constants import *
from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, DELAYED_SHIFT, MIN_LOSS_SCALE from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, DELAYED_SHIFT, MIN_LOSS_SCALE
from deepspeed.runtime.config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys from deepspeed.runtime.config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys
from deepspeed.runtime.zero.config import DeepSpeedZeroConfig from deepspeed.runtime.zero.config import DeepSpeedZeroConfig
from deepspeed.runtime.zero.constants import *
from deepspeed.runtime.activation_checkpointing.config import DeepSpeedActivationCheckpointingConfig from deepspeed.runtime.activation_checkpointing.config import DeepSpeedActivationCheckpointingConfig
from deepspeed.utils import logger from deepspeed.utils import logger
TENSOR_CORE_ALIGN_SIZE = 8 TENSOR_CORE_ALIGN_SIZE = 8
ONEBIT_ADAM_OPTIMIZER = 'onebitadam'
ADAM_OPTIMIZER = 'adam' ADAM_OPTIMIZER = 'adam'
LAMB_OPTIMIZER = 'lamb' LAMB_OPTIMIZER = 'lamb'
DEEPSPEED_OPTIMIZERS = [ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER] ONEBIT_ADAM_OPTIMIZER = 'onebitadam'
DEEPSPEED_ADAM = 'deepspeed_adam'
DEEPSPEED_OPTIMIZERS = [
ADAM_OPTIMIZER,
LAMB_OPTIMIZER,
ONEBIT_ADAM_OPTIMIZER,
DEEPSPEED_ADAM
]
def get_amp_enabled(param_dict): def get_amp_enabled(param_dict):
...@@ -111,22 +118,9 @@ def get_zero_optimization(param_dict): ...@@ -111,22 +118,9 @@ def get_zero_optimization(param_dict):
def get_zero_reduce_scatter(param_dict): def get_zero_reduce_scatter(param_dict):
return get_scalar_param(param_dict, ZERO_REDUCE_SCATTER, ZERO_REDUCE_SCATTER_DEFAULT)
def get_zero_max_elements_per_comm(param_dict):
return get_scalar_param(param_dict, return get_scalar_param(param_dict,
ZERO_MAX_ELEMENTS_PER_COMM, ZERO_OPTIMIZATION_REDUCE_SCATTER,
ZERO_MAX_ELEMENTS_PER_COMM_DEFAULT) ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT)
def get_allgather_size(param_dict):
return get_scalar_param(param_dict,
ALLGATHER_SIZE,
ALLGATHER_SIZE_DEFAULT) if get_scalar_param(
param_dict,
ALLGATHER_SIZE,
ALLGATHER_SIZE_DEFAULT) > 0 else ALLGATHER_SIZE_DEFAULT
def get_allreduce_always_fp32(param_dict): def get_allreduce_always_fp32(param_dict):
...@@ -493,8 +487,6 @@ class DeepSpeedConfig(object): ...@@ -493,8 +487,6 @@ class DeepSpeedConfig(object):
self.gradient_predivide_factor = get_gradient_predivide_factor(param_dict) self.gradient_predivide_factor = get_gradient_predivide_factor(param_dict)
self.sparse_gradients_enabled = get_sparse_gradients_enabled(param_dict) self.sparse_gradients_enabled = get_sparse_gradients_enabled(param_dict)
self.allgather_size = get_allgather_size(param_dict)
self.zero_config = DeepSpeedZeroConfig(param_dict) self.zero_config = DeepSpeedZeroConfig(param_dict)
self.zero_optimization_stage = self.zero_config.stage self.zero_optimization_stage = self.zero_config.stage
self.zero_enabled = self.zero_optimization_stage > 0 self.zero_enabled = self.zero_optimization_stage > 0
...@@ -628,15 +620,18 @@ class DeepSpeedConfig(object): ...@@ -628,15 +620,18 @@ class DeepSpeedConfig(object):
':')))) ':'))))
def _do_error_check(self): def _do_error_check(self):
if self.zero_enabled:
assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled"
assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION)
assert self.train_micro_batch_size_per_gpu, "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU) assert self.train_micro_batch_size_per_gpu, "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU)
assert self.gradient_accumulation_steps, 'DeepSpeedConfig: {} is not defined'.format( assert self.gradient_accumulation_steps, "DeepSpeedConfig: {} is not defined".format(
GRADIENT_ACCUMULATION_STEPS) GRADIENT_ACCUMULATION_STEPS)
if self.zero_enabled:
assert self.fp16_enabled, "DeepSpeedConfig: ZeRO is only supported if fp16 is enabled"
assert self.zero_optimization_stage <= MAX_STAGE_ZERO_OPTIMIZATION, "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format(MAX_STAGE_ZERO_OPTIMIZATION)
if self.zero_config.cpu_offload is True:
assert self.zero_optimization_stage == ZERO_OPTIMIZATION_GRADIENTS, "DeepSpeedConfig: cpu-offload supported ZeRO stage is {}".format(ZERO_OPTIMIZATION_GRADIENTS)
#assert self.gradient_accumulation_steps == 1, "DeepSpeedConfig: {}is not supported for {}".format(GRADIENT_ACCUMULATION_STEPS, ZERO_OPTIMIZATION_CPU_OFFLOAD)
def _do_warning_check(self): def _do_warning_check(self):
fp16_enabled = self.fp16_enabled or self.zero_enabled fp16_enabled = self.fp16_enabled or self.zero_enabled
......
...@@ -183,35 +183,6 @@ Gradient clipping should be enabled as: ...@@ -183,35 +183,6 @@ Gradient clipping should be enabled as:
GRADIENT_CLIPPING = 'gradient_clipping' GRADIENT_CLIPPING = 'gradient_clipping'
GRADIENT_CLIPPING_DEFAULT = 0. GRADIENT_CLIPPING_DEFAULT = 0.
#########################################
# ZeRO optimization
#########################################
# ZeRO optimization. By default, this optimization is not enabled.
# Users have to configure the desired optimization (0 means disabled) in params.json as below example:
ZERO_FORMAT = '''
ZeRO optimization should be enabled as:
"session_params": {
"zero_optimization": [0|1|2],
"zero_all_gather_size": 200
}
'''
ZERO_OPTIMIZATION = 'zero_optimization'
ZERO_OPTIMIZATION_DEFAULT = 0
ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1
ZERO_OPTIMIZATION_GRADIENTS = 2
ZERO_OPTIMIZATION_WEIGHTS = 3
MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_GRADIENTS
ZERO_REDUCE_SCATTER = "zero_reduce_scatter"
ZERO_REDUCE_SCATTER_DEFAULT = True
ZERO_MAX_ELEMENTS_PER_COMM = "zero_max_elements_per_comm"
ZERO_MAX_ELEMENTS_PER_COMM_DEFAULT = 5e8
ALLGATHER_SIZE = 'allgather_size'
ALLGATHER_SIZE_DEFAULT = 500000000
######################################### #########################################
# FP32 AllReduce # FP32 AllReduce
######################################### #########################################
......
...@@ -7,6 +7,7 @@ import torch ...@@ -7,6 +7,7 @@ import torch
import warnings import warnings
import torch.distributed as dist import torch.distributed as dist
import apex
from apex import amp from apex import amp
from torch.nn.modules import Module from torch.nn.modules import Module
from torch.distributed.distributed_c10d import _get_global_rank from torch.distributed.distributed_c10d import _get_global_rank
...@@ -14,20 +15,20 @@ from tensorboardX import SummaryWriter ...@@ -14,20 +15,20 @@ from tensorboardX import SummaryWriter
from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer
from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1 from deepspeed.runtime.zero.stage1 import FP16_DeepSpeedZeroOptimizer_Stage1
from deepspeed.runtime.zero.utils import is_zero_supported_optimizer
from deepspeed.runtime.activation_checkpointing import checkpointing as activation_checkpointing from deepspeed.runtime.activation_checkpointing import checkpointing as activation_checkpointing
from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer
from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
from deepspeed.runtime.config import DeepSpeedConfig, \ from deepspeed.runtime.config import DeepSpeedConfig, \
ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, DEEPSPEED_OPTIMIZERS ADAM_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, DEEPSPEED_ADAM, DEEPSPEED_OPTIMIZERS
from deepspeed.runtime.dataloader import DeepSpeedDataLoader from deepspeed.runtime.dataloader import DeepSpeedDataLoader
from deepspeed.runtime.constants import \ from deepspeed.runtime.constants import \
ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \ ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \
TORCH_DISTRIBUTED_DEFAULT_PORT, \ TORCH_DISTRIBUTED_DEFAULT_PORT
from deepspeed.runtime.zero.constants import \
ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS
from deepspeed.runtime.csr_tensor import CSRTensor from deepspeed.runtime.csr_tensor import CSRTensor
import deepspeed.runtime.lr_schedules as lr_schedules import deepspeed.runtime.lr_schedules as lr_schedules
from deepspeed.utils import logger from deepspeed.utils import logger
from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer
...@@ -105,7 +106,6 @@ class DeepSpeedEngine(Module): ...@@ -105,7 +106,6 @@ class DeepSpeedEngine(Module):
collate_fn=None, collate_fn=None,
config_params=None): config_params=None):
super(DeepSpeedEngine, self).__init__() super(DeepSpeedEngine, self).__init__()
self.client_optimizer = optimizer self.client_optimizer = optimizer
self.client_model_parameters = model_parameters self.client_model_parameters = model_parameters
self.client_lr_scheduler = lr_scheduler self.client_lr_scheduler = lr_scheduler
...@@ -266,7 +266,7 @@ class DeepSpeedEngine(Module): ...@@ -266,7 +266,7 @@ class DeepSpeedEngine(Module):
return self._config.train_micro_batch_size_per_gpu return self._config.train_micro_batch_size_per_gpu
def optimizer_name(self): def optimizer_name(self):
return self._config.optimizer_name return self.client_optimizer.__class__.__name__ if self.client_optimizer else self._config.optimizer_name
def optimizer_params(self): def optimizer_params(self):
return self._config.optimizer_params return self._config.optimizer_params
...@@ -292,6 +292,9 @@ class DeepSpeedEngine(Module): ...@@ -292,6 +292,9 @@ class DeepSpeedEngine(Module):
def zero_overlap_comm(self): def zero_overlap_comm(self):
return self._config.zero_config.overlap_comm return self._config.zero_config.overlap_comm
def zero_cpu_offload(self):
return self._config.zero_config.cpu_offload
def zero_optimization_stage(self): def zero_optimization_stage(self):
return self._config.zero_optimization_stage return self._config.zero_optimization_stage
...@@ -310,9 +313,6 @@ class DeepSpeedEngine(Module): ...@@ -310,9 +313,6 @@ class DeepSpeedEngine(Module):
def zero_load_from_fp32_weights(self): def zero_load_from_fp32_weights(self):
return self._config.zero_config.load_from_fp32_weights return self._config.zero_config.load_from_fp32_weights
def allgather_size(self):
return self._config.allgather_size
def fp16_enabled(self): def fp16_enabled(self):
return self._config.fp16_enabled return self._config.fp16_enabled
...@@ -491,6 +491,7 @@ class DeepSpeedEngine(Module): ...@@ -491,6 +491,7 @@ class DeepSpeedEngine(Module):
# Configure optimizer # Configure optimizer
def _configure_optimizer(self, client_optimizer, model_parameters): def _configure_optimizer(self, client_optimizer, model_parameters):
if client_optimizer is not None: if client_optimizer is not None:
basic_optimizer = client_optimizer basic_optimizer = client_optimizer
logger.info('Using client Optimizer as basic optimizer') logger.info('Using client Optimizer as basic optimizer')
...@@ -504,13 +505,14 @@ class DeepSpeedEngine(Module): ...@@ -504,13 +505,14 @@ class DeepSpeedEngine(Module):
if self.zero_optimization(): if self.zero_optimization():
assert not self.amp_enabled(), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similar to amp opt_mode=O2" assert not self.amp_enabled(), "Amp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similar to amp opt_mode=O2"
if self.optimizer_name() != ADAM_OPTIMIZER: if not is_zero_supported_optimizer(basic_optimizer):
assert self.zero_allow_untested_optimizer(), \ assert self.zero_allow_untested_optimizer(), \
'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.'
logger.warning( logger.warning(
"**** You are using ZeRO with an untested optimizer, proceed with caution *****" "**** You are using ZeRO with an untested optimizer, proceed with caution *****"
) )
self.optimizer = self._configure_zero_optimizer(basic_optimizer) self.optimizer = self._configure_zero_optimizer(basic_optimizer)
elif self.amp_enabled(): elif self.amp_enabled():
assert not self.fp16_enabled(), "Cannot enable both amp with (legacy) fp16 mode" assert not self.fp16_enabled(), "Cannot enable both amp with (legacy) fp16 mode"
...@@ -522,8 +524,8 @@ class DeepSpeedEngine(Module): ...@@ -522,8 +524,8 @@ class DeepSpeedEngine(Module):
self.optimizer = self._configure_fp16_optimizer(basic_optimizer) self.optimizer = self._configure_fp16_optimizer(basic_optimizer)
else: else:
self.optimizer = basic_optimizer self.optimizer = basic_optimizer
logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer))
# logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer.state_dict())) logger.info('DeepSpeed Final Optimizer = {}'.format(self.optimizer.state_dict()))
def _configure_basic_optimizer(self, model_parameters): def _configure_basic_optimizer(self, model_parameters):
optimizer_parameters = self.optimizer_params() optimizer_parameters = self.optimizer_params()
...@@ -533,8 +535,14 @@ class DeepSpeedEngine(Module): ...@@ -533,8 +535,14 @@ class DeepSpeedEngine(Module):
"'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details" "'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details"
) )
if self.optimizer_name() == ADAM_OPTIMIZER: if self.optimizer_name() == ADAM_OPTIMIZER:
from apex.optimizers.fused_adam import FusedAdam if self.zero_cpu_offload():
optimizer = FusedAdam(model_parameters, **optimizer_parameters) optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters)
else:
from apex.optimizers.fused_adam import FusedAdam
optimizer = FusedAdam(model_parameters, **optimizer_parameters)
elif self.optimizer_name() == DEEPSPEED_ADAM:
from deepspeed.ops.adam import DeepSpeedCPUAdam
optimizer = DeepSpeedCPUAdam(model_parameters, **optimizer_parameters)
elif self.optimizer_name() == LAMB_OPTIMIZER: elif self.optimizer_name() == LAMB_OPTIMIZER:
from deepspeed.ops.lamb import FusedLamb from deepspeed.ops.lamb import FusedLamb
optimizer = FusedLamb(model_parameters, **optimizer_parameters) optimizer = FusedLamb(model_parameters, **optimizer_parameters)
...@@ -550,8 +558,9 @@ class DeepSpeedEngine(Module): ...@@ -550,8 +558,9 @@ class DeepSpeedEngine(Module):
initial_dynamic_scale = self.initial_dynamic_scale() initial_dynamic_scale = self.initial_dynamic_scale()
dynamic_loss_args = self.dynamic_loss_scale_args() dynamic_loss_args = self.dynamic_loss_scale_args()
clip_grad = self.gradient_clipping() clip_grad = self.gradient_clipping()
if self.optimizer_name() == ADAM_OPTIMIZER or self.optimizer_name( if isinstance(optimizer,
) == ONEBIT_ADAM_OPTIMIZER: apex.optimizers.FusedAdam) or self.optimizer_name(
) == ONEBIT_ADAM_OPTIMIZER:
if self.dynamic_loss_scale(): if self.dynamic_loss_scale():
logger.info('Creating fp16 optimizer with dynamic loss scale') logger.info('Creating fp16 optimizer with dynamic loss scale')
timers = self.timers if self.wall_clock_breakdown() else None timers = self.timers if self.wall_clock_breakdown() else None
...@@ -616,9 +625,11 @@ class DeepSpeedEngine(Module): ...@@ -616,9 +625,11 @@ class DeepSpeedEngine(Module):
dp_process_group=self.data_parallel_group, dp_process_group=self.data_parallel_group,
reduce_scatter=self.zero_reduce_scatter(), reduce_scatter=self.zero_reduce_scatter(),
overlap_comm=self.zero_overlap_comm(), overlap_comm=self.zero_overlap_comm(),
cpu_offload=self.zero_cpu_offload(),
mpu=self.mpu, mpu=self.mpu,
postscale_gradients=self.postscale_gradients(), postscale_gradients=self.postscale_gradients(),
gradient_predivide_factor=self.gradient_predivide_factor()) gradient_predivide_factor=self.gradient_predivide_factor(),
gradient_accumulation_steps=self.gradient_accumulation_steps())
else: else:
raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage))
...@@ -724,7 +735,6 @@ class DeepSpeedEngine(Module): ...@@ -724,7 +735,6 @@ class DeepSpeedEngine(Module):
return loss return loss
def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE):
#Zero stage 2 communicates during non gradient accumulation boundaries as well #Zero stage 2 communicates during non gradient accumulation boundaries as well
if self.zero_optimization_partition_gradients(): if self.zero_optimization_partition_gradients():
self.optimizer.overlapping_partition_gradients_reduce_epilogue() self.optimizer.overlapping_partition_gradients_reduce_epilogue()
...@@ -780,6 +790,8 @@ class DeepSpeedEngine(Module): ...@@ -780,6 +790,8 @@ class DeepSpeedEngine(Module):
self.timers('backward_inner').start() self.timers('backward_inner').start()
if self.zero_optimization(): if self.zero_optimization():
self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary(
)
self.optimizer.backward(loss) self.optimizer.backward(loss)
elif self.amp_enabled(): elif self.amp_enabled():
# AMP requires delaying unscale when inside gradient accumulation boundaries # AMP requires delaying unscale when inside gradient accumulation boundaries
...@@ -854,7 +866,6 @@ class DeepSpeedEngine(Module): ...@@ -854,7 +866,6 @@ class DeepSpeedEngine(Module):
master_params = amp.master_params(self.optimizer) master_params = amp.master_params(self.optimizer)
torch.nn.utils.clip_grad_norm_(parameters=master_params, torch.nn.utils.clip_grad_norm_(parameters=master_params,
max_norm=self.gradient_clipping()) max_norm=self.gradient_clipping())
self.optimizer.step() self.optimizer.step()
#zero grad in basic optimizer could be unreliable and may not exhibit #zero grad in basic optimizer could be unreliable and may not exhibit
...@@ -957,6 +968,9 @@ class DeepSpeedEngine(Module): ...@@ -957,6 +968,9 @@ class DeepSpeedEngine(Module):
def get_lr(self): def get_lr(self):
return self._get_optimizer_param('lr') return self._get_optimizer_param('lr')
def get_type(self):
return self._get_optimizer_param('type')
def get_mom(self): def get_mom(self):
return self._get_optimizer_param('betas') return self._get_optimizer_param('betas')
......
...@@ -5,79 +5,7 @@ Licensed under the MIT license. ...@@ -5,79 +5,7 @@ Licensed under the MIT license.
from deepspeed.runtime.config_utils import get_scalar_param from deepspeed.runtime.config_utils import get_scalar_param
from deepspeed.utils import logger from deepspeed.utils import logger
from deepspeed.runtime.zero.constants import *
#########################################
# ZeRO optimization
#########################################
# ZeRO optimization. By default, this optimization is not enabled.
# Users have to configure the desired optimization (0 means disabled) in params.json as below example:
ZERO_FORMAT = '''
ZeRO optimization should be enabled as:
"session_params": {
"zero_optimization": {
"stage": [0|1|2],
"allgather_partitions": [true|false],
"allgather_bucket_size": 500000000,
"reduce_scatter": [true|false],
"contiguous_gradients" : [true|false]
"overlap_comm": [true|false],
"reduce_bucket_size": 500000000
"load_from_fp32_weights": [true|false]
}
}
'''
ZERO_OPTIMIZATION = 'zero_optimization'
ZERO_OPTIMIZATION_DISABLED = 0
ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1
ZERO_OPTIMIZATION_GRADIENTS = 2
ZERO_OPTIMIZATION_WEIGHTS = 3
MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_GRADIENTS
ZERO_OPTIMIZATION_STAGE = 'stage'
ZERO_OPTIMIZATION_STAGE_1 = 'stage_1'
ZERO_OPTIMIZATION_STAGE_2 = 'stage_2'
ZERO_OPTIMIZATION_STAGE_3 = 'stage_3'
ZERO_OPTIMIZATION_STAGE_DEFAULT = ZERO_OPTIMIZATION_DISABLED
ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS = 'allgather_partitions'
ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True
ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter'
ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True
ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm'
ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False
ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS = 'contiguous_gradients'
ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT = False
ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE = 'reduce_bucket_size'
ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT = 500000000
ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE = 'allgather_bucket_size'
ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT = 500000000
ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED = 'allgather_size'
ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights'
ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True
ZERO_OPTIMIZATION_DEFAULT = {
ZERO_OPTIMIZATION_STAGE:
ZERO_OPTIMIZATION_STAGE_DEFAULT,
ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS:
ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT,
ZERO_OPTIMIZATION_REDUCE_SCATTER:
ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT,
ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE:
ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT,
ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS:
ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT,
ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE:
ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT,
ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS:
ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT
}
class DeepSpeedZeroConfig(object): class DeepSpeedZeroConfig(object):
...@@ -92,6 +20,7 @@ class DeepSpeedZeroConfig(object): ...@@ -92,6 +20,7 @@ class DeepSpeedZeroConfig(object):
self.allgather_bucket_size = None self.allgather_bucket_size = None
self.overlap_comm = None self.overlap_comm = None
self.load_from_fp32_weights = None self.load_from_fp32_weights = None
self.cpu_offload = None
if ZERO_OPTIMIZATION in param_dict.keys(): if ZERO_OPTIMIZATION in param_dict.keys():
zero_config_dict = param_dict[ZERO_OPTIMIZATION] zero_config_dict = param_dict[ZERO_OPTIMIZATION]
...@@ -156,7 +85,12 @@ class DeepSpeedZeroConfig(object): ...@@ -156,7 +85,12 @@ class DeepSpeedZeroConfig(object):
zero_config_dict, zero_config_dict,
ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE, ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE,
ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT) ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT)
self.load_from_fp32_weights = get_scalar_param( self.load_from_fp32_weights = get_scalar_param(
zero_config_dict, zero_config_dict,
ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS,
ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT) ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT)
self.cpu_offload = get_scalar_param(zero_config_dict,
ZERO_OPTIMIZATION_CPU_OFFLOAD,
ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT)
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
#########################################
# ZeRO optimization
#########################################
# ZeRO optimization. By default, this optimization is not enabled.
# Users have to configure the desired optimization (0 means disabled) in params.json as below example:
# Illustrative layout of the "zero_optimization" config section.
# "[a|b]" denotes the allowed alternatives for a value; every entry except the
# last one in an object is comma-terminated so the layout reads as a valid object.
ZERO_FORMAT = '''
ZeRO optimization should be enabled as:
"session_params": {
"zero_optimization": {
"stage": [0|1|2],
"allgather_partitions": [true|false],
"allgather_bucket_size": 500000000,
"reduce_scatter": [true|false],
"contiguous_gradients" : [true|false],
"overlap_comm": [true|false],
"reduce_bucket_size": 500000000,
"load_from_fp32_weights": [true|false],
"cpu_offload": [true|false]
}
}
'''
# Name of the ZeRO section inside the user config.
ZERO_OPTIMIZATION = 'zero_optimization'

# Integer identifiers for the ZeRO stages.
ZERO_OPTIMIZATION_DISABLED = 0
ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1
ZERO_OPTIMIZATION_GRADIENTS = 2
ZERO_OPTIMIZATION_WEIGHTS = 3
# Bound to the GRADIENTS stage (2), not to WEIGHTS (3).
MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_GRADIENTS

# 'stage' key, per-stage string names, and the stage used when none is given.
ZERO_OPTIMIZATION_STAGE = 'stage'
ZERO_OPTIMIZATION_STAGE_1 = 'stage_1'
ZERO_OPTIMIZATION_STAGE_2 = 'stage_2'
ZERO_OPTIMIZATION_STAGE_3 = 'stage_3'
ZERO_OPTIMIZATION_STAGE_DEFAULT = ZERO_OPTIMIZATION_DISABLED

# Each remaining option is a (config key, default value) pair.
ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS = 'allgather_partitions'
ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True

ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter'
ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True

ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm'
ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False

ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS = 'contiguous_gradients'
ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT = False

ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE = 'reduce_bucket_size'
ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT = 500000000

ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE = 'allgather_bucket_size'
ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT = 500000000
# Older spelling of the allgather bucket size key.
ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED = 'allgather_size'

ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights'
ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True

ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload'
ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False

# Key -> default mapping. Note that 'overlap_comm' has no entry here.
ZERO_OPTIMIZATION_DEFAULT = {
    ZERO_OPTIMIZATION_STAGE: ZERO_OPTIMIZATION_STAGE_DEFAULT,
    ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS: ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT,
    ZERO_OPTIMIZATION_REDUCE_SCATTER: ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT,
    ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE: ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT,
    ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS: ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT,
    ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE: ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT,
    ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT,
    ZERO_OPTIMIZATION_CPU_OFFLOAD: ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT,
}
...@@ -793,8 +793,11 @@ class FP16_DeepSpeedZeroOptimizer_Stage1(object): ...@@ -793,8 +793,11 @@ class FP16_DeepSpeedZeroOptimizer_Stage1(object):
def _get_state_without_padding(self, state_with_padding, padding): def _get_state_without_padding(self, state_with_padding, padding):
lean_state = {} lean_state = {}
for key, value in state_with_padding.items(): for key, value in state_with_padding.items():
lean_length = value.numel() - padding if torch.is_tensor(value):
lean_state[key] = value[:lean_length] lean_length = value.numel() - padding
lean_state[key] = value[:lean_length]
else:
lean_state[key] = value
return lean_state return lean_state
......
This diff is collapsed.
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import apex
from deepspeed.utils import logger from deepspeed.utils import logger
from deepspeed.ops.adam import DeepSpeedCPUAdam
def _initialize_parameter_parallel_groups(parameter_parallel_size=None): def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
...@@ -20,3 +21,17 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None): ...@@ -20,3 +21,17 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
if rank in ranks: if rank in ranks:
my_group = group my_group = group
return my_group return my_group
# Optimizer classes that is_zero_supported_optimizer() matches against
# (membership is tested on the exact class of the optimizer instance).
ZERO_SUPPORTED_OPTIMIZERS = [
    torch.optim.Adam,
    apex.optimizers.FusedAdam,
    DeepSpeedCPUAdam,
]
def is_zero_supported_optimizer(optimizer):
    """Report whether ``optimizer`` is one of the ZeRO-supported optimizer types.

    Args:
        optimizer: the optimizer instance to check.

    Returns:
        bool: True when ``type(optimizer)`` is itself an entry of
        ``ZERO_SUPPORTED_OPTIMIZERS``. The comparison is on the exact class,
        so an instance of a subclass of a listed optimizer yields False.
    """
    # Route the diagnostic through the module logger (already imported here)
    # instead of a bare print, so it follows the configured log level/handlers.
    logger.info(
        f'Checking ZeRO support for optimizer={optimizer.__class__.__name__} type={type(optimizer)}'
    )
    return type(optimizer) in ZERO_SUPPORTED_OPTIMIZERS
...@@ -162,6 +162,14 @@ Please see the [core API doc](https://deepspeed.readthedocs.io/) for more detail ...@@ -162,6 +162,14 @@ Please see the [core API doc](https://deepspeed.readthedocs.io/) for more detail
With DeepSpeed, the user can choose to use a high performance implementation of ADAM from With DeepSpeed, the user can choose to use a high performance implementation of ADAM from
NVIDIA, or any training optimizer that extends torch's `torch.optim.Optimizer` class. NVIDIA, or any training optimizer that extends torch's `torch.optim.Optimizer` class.
### CPU-Adam: High-Performance vectorized implementation of Adam
We introduce an efficient implementation of the Adam optimizer on CPU that improves the parameter-update
performance by nearly an order of magnitude. We use the AVX SIMD instructions on Intel-x86 architecture
for the CPU-Adam implementation. We support both AVX-512 and AVX-2 instruction sets. DeepSpeed uses
AVX-2 by default, which can be switched to AVX-512 by setting the build flag `DS_BUILD_AVX512` to 1 when
installing DeepSpeed. Using AVX-512, we observe 5.1x to 6.5x speedups with respect to torch-adam for
model sizes between 1 and 10 billion parameters.
### Memory bandwidth optimized FP16 Optimizer ### Memory bandwidth optimized FP16 Optimizer
Mixed precision training is handled by the DeepSpeed FP16 Optimizer. This optimizer not Mixed precision training is handled by the DeepSpeed FP16 Optimizer. This optimizer not
only handles FP16 training but is also highly efficient. The performance of weight update only handles FP16 training but is also highly efficient. The performance of weight update
......
...@@ -167,6 +167,7 @@ overview](/features/) for descriptions and usage. ...@@ -167,6 +167,7 @@ overview](/features/) for descriptions and usage.
* Automatic loss scaling with mixed precision * Automatic loss scaling with mixed precision
* [Training Optimizers](/features/#training-optimizers) * [Training Optimizers](/features/#training-optimizers)
* Fused Adam optimizer and arbitrary `torch.optim.Optimizer` * Fused Adam optimizer and arbitrary `torch.optim.Optimizer`
* CPU-Adam: High-Performance vectorized Adam
* Memory bandwidth optimized FP16 Optimizer * Memory bandwidth optimized FP16 Optimizer
* Large Batch Training with LAMB Optimizer * Large Batch Training with LAMB Optimizer
* Memory efficient Training with ZeRO Optimizer * Memory efficient Training with ZeRO Optimizer
......
...@@ -164,10 +164,10 @@ if [ ! -f $hostfile ]; then ...@@ -164,10 +164,10 @@ if [ ! -f $hostfile ]; then
local_only=1 local_only=1
fi fi
#if [ "$skip_requirements" == "0" ]; then if [ "$skip_requirements" == "0" ]; then
# # Ensure dependencies are installed locally # Ensure dependencies are installed locally
# $PIP_SUDO $PIP_INSTALL -r requirements.txt $PIP_SUDO $PIP_INSTALL -r requirements/requirements.txt
#fi fi
# Build wheels # Build wheels
if [ "$third_party_install" == "1" ]; then if [ "$third_party_install" == "1" ]; then
...@@ -220,10 +220,10 @@ else ...@@ -220,10 +220,10 @@ else
tmp_wheel_path="/tmp/deepspeed_wheels" tmp_wheel_path="/tmp/deepspeed_wheels"
pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi" pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi"
#pdcp -w $hosts requirements/*.txt ${tmp_wheel_path}/ pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/
#if [ "$skip_requirements" == "0" ]; then if [ "$skip_requirements" == "0" ]; then
# pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt" pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt"
#fi fi
if [ "$third_party_install" == "1" ]; then if [ "$third_party_install" == "1" ]; then
pdsh -w $hosts "$PIP_SUDO pip uninstall -y apex" pdsh -w $hosts "$PIP_SUDO pip uninstall -y apex"
pdcp -w $hosts third_party/apex/dist/apex*.whl $tmp_wheel_path/ pdcp -w $hosts third_party/apex/dist/apex*.whl $tmp_wheel_path/
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment