Commit 4d6ed501 authored by Deyu Fu's avatar Deyu Fu

Merge branch 'multi_tensor_sgd' into deyuf/fused_optimizer_v2

parents 690b1f71 9f64bf27
......@@ -124,29 +124,13 @@ def check_optimizers(optimizers):
raise RuntimeError("An incoming optimizer is an instance of {}. ".format(bad_optim_type) +
"The optimizer(s) passed to amp.initialize() must be bare \n"
"instances of either ordinary Pytorch optimizers, or Apex fused \n"
"optimizers (currently just FusedAdam, but FusedSGD will be added \n"
"soon). You should not manually wrap your optimizer in either \n"
"optimizers (FusedAdam or FusedSGD). \n"
"You should not manually wrap your optimizer in either \n"
"apex.fp16_utils.FP16_Optimizer or apex.optimizers.FP16_Optimizer. \n"
"amp.initialize will take care of that for you (if necessary) based \n"
"on the specified opt_level (and optional overridden properties).")
def wrap_fused_adam(optimizer, properties):
msg = 'Currently, the usage of FusedAdam is restricted to '\
'amp.initialize(..., opt_level="O2", keep_batchnorm_fp32=False, '\
'loss_scale=float or "dynamic"). We are working on enabling more general usage.'
assert properties.master_weights is True, msg
assert properties.cast_model_type is torch.float16, msg
assert (properties.keep_batchnorm_fp32 is False or
properties.keep_batchnorm_fp32 is None), msg
if properties.loss_scale == "dynamic":
return FP16_Optimizer_for_fused(optimizer, dynamic_loss_scale=True)
else:
return FP16_Optimizer_for_fused(optimizer, static_loss_scale=properties.loss_scale)
def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
from apex.parallel import DistributedDataParallel as apex_DDP
from .amp import init as amp_init
......@@ -176,7 +160,6 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
if not _amp_state.allow_incoming_model_not_fp32:
check_params_fp32(models)
# In the future, when FP16_Optimizer can be deprecated and master weights can
# become an attribute, remember to stash master weights before casting the model.
......@@ -223,10 +206,6 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
model.forward = patch_forward(model.forward)
for i, optimizer in enumerate(optimizers):
# Still need to special case this for the first pass
if isinstance(optimizer, FusedAdam):
optimizers[i] = wrap_fused_adam(optimizer, properties)
else:
optimizers[i] = _process_optimizer(optimizer, properties)
_amp_state.loss_scalers = []
......
......@@ -3,6 +3,7 @@ from ..fp16_utils import master_params_to_model_params
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import maybe_print
import torch
from ..optimizers import FusedAdam, FusedSGD
class AmpOptimizerState(object):
......@@ -10,6 +11,20 @@ class AmpOptimizerState(object):
pass
def _master_params_to_model_params(self):
stash = self._amp_stash
if multi_tensor_applier.available:
if len(stash.all_fp16_params) > 0:
multi_tensor_applier(
stash.multi_tensor_scale,
stash.dummy_overflow_buf,
[stash.all_fp32_from_fp16_params, stash.all_fp16_params],
1.0)
else:
for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
master_params_to_model_params(fp16_group, fp32_from_fp16_group)
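The branch above is the fast path: when the amp_C extension is available, a single multi_tensor_scale launch with scale 1.0 copies and casts every fp32 master param into its fp16 model param; otherwise the per-group Python helper runs. A standalone sketch of the same call pattern, assuming apex was built with its CUDA extensions (tensor names are illustrative):
import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

overflow_buf = torch.cuda.IntTensor([0])
fp32_masters = [torch.randn(5, device='cuda'), torch.randn(3, device='cuda')]
fp16_models = [torch.zeros_like(t, dtype=torch.half) for t in fp32_masters]
# scale=1.0 turns the fused scale kernel into a fused copy/cast over all tensors in one launch
multi_tensor_applier(amp_C.multi_tensor_scale, overflow_buf, [fp32_masters, fp16_models], 1.0)
assert torch.allclose(fp16_models[0].float(), fp32_masters[0], atol=1e-2)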
def lazy_init_with_master_weights(self):
stash = self._amp_stash
stash.fp16_groups = []
......@@ -60,6 +75,8 @@ def lazy_init_with_master_weights(self):
for group in stash.fp32_from_fp32_groups:
stash.all_fp32_from_fp32_params += group
# all_fp16_grad_stash is only needed for fused optimizers.
stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
# stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]
......@@ -73,15 +90,55 @@ def lazy_init_with_master_weights(self):
self.load_state_dict(self.state_dict())
def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None):
grads_have_scale, stashed_have_scale, out_scale = scaler.loss_scale(), 1.0, 1.0
if scale_override is not None:
grads_have_scale, stashed_have_scale, out_scale = scale_override
# This is a lot of python overhead...
grads_needing_unscale = []
grads_needing_unscale_with_stash = []
stashed = []
for param, stashed_grad in zip(params, stashed_grads):
if param.grad is None and stashed_grad is not None:
param.grad = stashed_grad
elif param.grad is not None and stashed_grad is None:
grads_needing_unscale.append(param.grad)
elif param.grad is not None and stashed_grad is not None:
grads_needing_unscale_with_stash.append(param.grad)
stashed.append(stashed_grad)
else: # param.grad is None and stashed_grad is None
continue
# unscale() implements grads*(1/scale), so "scale" should be grads_have_scale/out_scale.
if len(grads_needing_unscale) > 0:
scaler.unscale(
grads_needing_unscale,
grads_needing_unscale,
None, # unused_scale, currently present to avoid API breakage elsewhere
models_are_masters=True,
scale_override=grads_have_scale/out_scale)
if len(grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
grads_needing_unscale_with_stash,
stashed,
grads_needing_unscale_with_stash,
scale_override=(grads_have_scale, stashed_have_scale, out_scale))
# Clear the stash.
for i in range(len(stashed_grads)):
stashed_grads[i] = None
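To make the scale_override arithmetic above concrete, a small worked example (numbers are illustrative, not from the diff): freshly computed grads carry grads_have_scale, stashed grads carry stashed_have_scale, and both are brought to out_scale.
grads_have_scale, stashed_have_scale = 2.0**16, 2.0**15  # e.g. the loss scale dropped between backwards
out_scale = min(grads_have_scale, stashed_have_scale)    # FusedSGD-style override keeps grads scaled
a = out_scale / grads_have_scale    # multiplier for the freshly computed grads -> 0.5
b = out_scale / stashed_have_scale  # multiplier for the stashed grads          -> 1.0
assert (a, b) == (0.5, 1.0)
# With no override, out_scale = stashed_have_scale = 1.0, so a = 1/loss_scale and b = 1.0,
# i.e. the classic "unscale the new grads, add the stashed grads unchanged" behaviour.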
def prepare_backward_with_master_weights(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
self._amp_lazy_init()
for i, param in enumerate(stash.all_fp16_params):
# Set up to leverage grad copy elision:
# Set up to leverage grad copy elision.
# This may behave differently from an unpatched optimizer if zero_grad is used and the param is unused.
param.grad = None
# for i, param in enumerate(stash.all_fp32_from_fp16_params):
......@@ -96,6 +153,8 @@ def prepare_backward_with_master_weights(self):
def post_backward_with_master_weights(self, scaler):
stash = self._amp_stash
self._amp_lazy_init()
# This is a lot of python overhead...
fp16_grads_needing_unscale = []
new_fp32_grads = []
......@@ -129,37 +188,10 @@ def post_backward_with_master_weights(self, scaler):
preexisting_fp32_grads)
# fp32 params can be treated as they would be in the "no_master_weights" case.
grads_needing_unscale = []
grads_needing_unscale_with_stash = []
stashed = []
for param, stashed_grad in zip(stash.all_fp32_from_fp32_params,
stash.all_fp32_from_fp32_grad_stash):
if param.grad is None and stashed_grad is not None:
param.grad = stashed_grad
elif param.grad is not None and stashed_grad is None:
grads_needing_unscale.append(param.grad)
elif param.grad is not None and stashed_grad is not None:
grads_needing_unscale_with_stash.append(param.grad)
stashed.append(stashed_grad)
else: # param.grad is None and stashed_grad is None:
continue
if len(grads_needing_unscale) > 0:
scaler.unscale(
grads_needing_unscale,
grads_needing_unscale,
scaler.loss_scale(),
models_are_masters=True)
if len(grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
grads_needing_unscale_with_stash,
stashed,
grads_needing_unscale_with_stash)
# Clear the stash.
for i in range(len(stash.all_fp32_from_fp32_grad_stash)):
stash.all_fp32_from_fp32_grad_stash[i] = None
post_backward_models_are_masters(
scaler,
stash.all_fp32_from_fp32_params,
stash.all_fp32_from_fp32_grad_stash)
def lazy_init_no_master_weights(self):
......@@ -184,9 +216,7 @@ def lazy_init_no_master_weights(self):
def prepare_backward_no_master_weights(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
self._amp_lazy_init()
for i, param in enumerate(stash.all_fp16_params):
stash.all_fp16_grad_stash[i] = param.grad
......@@ -202,55 +232,141 @@ def prepare_backward_no_master_weights(self):
def post_backward_no_master_weights(self, scaler):
stash = self._amp_stash
self._amp_lazy_init()
split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
(stash.all_fp32_params, stash.all_fp32_grad_stash))
for params, stashed_grads in split_types:
# This is a lot of python overhead...
grads_needing_unscale = []
grads_needing_unscale_with_stash = []
stashed = []
for param, stashed_grad in zip(params, stashed_grads):
if param.grad is None and stashed_grad is not None:
param.grad = stashed_grad
elif param.grad is not None and stashed_grad is None:
grads_needing_unscale.append(param.grad)
elif param.grad is not None and stashed_grad is not None:
grads_needing_unscale_with_stash.append(param.grad)
stashed.append(stashed_grad)
else: # param.grad is None and stashed_grad is None
continue
post_backward_models_are_masters(scaler, params, stashed_grads)
if len(grads_needing_unscale) > 0:
scaler.unscale(
grads_needing_unscale,
grads_needing_unscale,
scaler.loss_scale(),
models_are_masters=True)
if len(grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
grads_needing_unscale_with_stash,
stashed,
grads_needing_unscale_with_stash)
#####################################################################################
# FusedAdam versions
#####################################################################################
# Clear the stash.
for i in range(len(stashed_grads)):
stashed_grads[i] = None
def prepare_backward_with_master_weights_FusedAdam(self):
stash = self._amp_stash
self._amp_lazy_init()
def _master_params_to_model_params(self):
def post_backward_with_master_weights_FusedAdam(self, scaler):
stash = self._amp_stash
if multi_tensor_applier.available:
if len(stash.all_fp16_params) > 0:
multi_tensor_applier(
stash.multi_tensor_scale,
self._amp_lazy_init()
stash.scale = scaler.loss_scale()
stash.grads = [[param.grad.data for param in group] for group in stash.fp16_groups]
stash.output_params = [[param for param in group] for group in stash.fp16_groups]
norm_groups = []
skip = False
for grad_group in stash.grads:
norm, _ = multi_tensor_applier(
stash.multi_tensor_l2norm,
stash.dummy_overflow_buf,
[stash.all_fp32_from_fp16_params, stash.all_fp16_params],
1.0)
[grad_group],
False)
# Still syncing here for now.
norm = float(norm)
norm_groups.append(norm)
if norm == float('inf') or norm == -float('inf') or norm != norm:
skip = True
if skip:
scaler._overflow_buf.fill_(1.)
scaler._has_overflow = True
stash.grad_norms = norm_groups
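The skip decision above reduces to a scalar test on each group norm. A minimal standalone sketch of that test (math.isinf covers +/-inf; norm != norm is the NaN check used in the diff):
import math

def grad_norm_overflowed(norm):
    return math.isinf(norm) or norm != norm

assert grad_norm_overflowed(float('inf'))
assert grad_norm_overflowed(float('nan'))
assert not grad_norm_overflowed(3.5)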
def prepare_backward_no_master_weights_FusedAdam(self):
stash = self._amp_stash
self._amp_lazy_init()
def post_backward_no_master_weights_FusedAdam(self, scaler):
stash = self._amp_stash
self._amp_lazy_init()
stash.scale = scaler.loss_scale()
stash.grads = None
stash.output_params = None
stash.grad_norms = None
#####################################################################################
# FusedSGD versions
# Eat this ugly code duplication for now. First make it work, then make it clean.
# It's difficult to anticipate what can be unified between the FusedAdam and FusedSGD
# implementations until I have them both working.
#####################################################################################
# FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params
# outside the kernel, so we must accumulate directly into the model grads.
def prepare_backward_with_master_weights_FusedSGD(self):
if self.materialize_master_grads:
prepare_backward_with_master_weights(self)
else:
for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
master_params_to_model_params(fp16_group, fp32_from_fp16_group)
stash = self._amp_stash
self._amp_lazy_init()
for i, param in enumerate(stash.all_fp16_params):
stash.all_fp16_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
for i, param in enumerate(stash.all_fp32_from_fp32_params):
stash.all_fp32_from_fp32_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
def post_backward_with_master_weights_FusedSGD(self, scaler):
if self.materialize_master_grads:
post_backward_with_master_weights(self, scaler)
else:
stash = self._amp_stash
self._amp_lazy_init()
grads_have_scale = scaler.loss_scale()
stashed_have_scale = self.most_recent_scale
out_scale = grads_have_scale
if self.scale_set_by_backward:
out_scale = min(grads_have_scale, self.most_recent_scale)
split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
(stash.all_fp32_from_fp32_params, stash.all_fp32_from_fp32_grad_stash))
# unscale_with_stashed() implements grads*1/scale + stashed_grads*1.
# stashed_grads are scaled by self.most_recent_scale.
for params, stashed_grads in split_types:
post_backward_models_are_masters(scaler, params, stashed_grads,
(grads_have_scale, stashed_have_scale, out_scale))
self.most_recent_scale = out_scale
self.scale_set_by_backward = True
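A small trace of how most_recent_scale and scale_set_by_backward evolve across two backwards under gradient accumulation, assuming the loss scale dropped in between (pure Python, values illustrative; the real bookkeeping runs inside post_backward_with_master_weights_FusedSGD above):
most_recent_scale, scale_set_by_backward = 1.0, False
for loss_scale in (2.0**16, 2.0**15):        # the scaler reduced the scale after an overflow
    grads_have_scale = loss_scale
    stashed_have_scale = most_recent_scale
    out_scale = grads_have_scale
    if scale_set_by_backward:
        out_scale = min(grads_have_scale, most_recent_scale)
    # ... unscale_with_stashed(..., scale_override=(grads_have_scale, stashed_have_scale, out_scale)) ...
    most_recent_scale, scale_set_by_backward = out_scale, True
assert most_recent_scale == 2.0**15          # accumulated grads end up carrying the smaller scale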
def prepare_backward_no_master_weights_FusedSGD(self):
prepare_backward_no_master_weights(self)
def post_backward_no_master_weights_FusedSGD(self, scaler):
post_backward_no_master_weights(self, scaler)
def _amp_lazy_init(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
def _process_optimizer(optimizer, properties):
......@@ -266,7 +382,8 @@ def _process_optimizer(optimizer, properties):
for name in ("_lazy_init_maybe_master_weights",
"_master_params_to_model_params",
"_prepare_amp_backward",
"_post_amp_backward"):
"_post_amp_backward",
"_amp_lazy_init"):
if hasattr(optimizer, name):
raise RuntimeError("Incoming optimizer already has {} defined.".format(name))
......@@ -274,6 +391,7 @@ def _process_optimizer(optimizer, properties):
if multi_tensor_applier.available:
import amp_C
optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);
if properties.master_weights:
......@@ -288,6 +406,7 @@ def _process_optimizer(optimizer, properties):
if closure is not None:
raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
retval = old_step()
if not (isinstance(self, FusedAdam) or isinstance(self, FusedSGD)):
self._master_params_to_model_params()
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in self._amp_stash.all_fp32_from_fp16_params:
......@@ -298,9 +417,7 @@ def _process_optimizer(optimizer, properties):
old_zero_grad = optimizer.zero_grad
def new_zero_grad(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
self._amp_lazy_init()
# Zero the model grads.
for param in stash.all_fp16_params:
if param.grad is not None:
......@@ -315,21 +432,43 @@ def _process_optimizer(optimizer, properties):
param.grad = None
optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
if isinstance(optimizer, FusedAdam):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights_FusedAdam, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_with_master_weights_FusedAdam, optimizer)
elif isinstance(optimizer, FusedSGD):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights_FusedSGD, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_with_master_weights_FusedSGD, optimizer)
else:
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_with_master_weights, optimizer)
else:
optimizer._lazy_init_maybe_master_weights = types.MethodType(
lazy_init_no_master_weights, optimizer)
if isinstance(optimizer, FusedAdam):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights_FusedAdam, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_no_master_weights_FusedAdam, optimizer)
elif isinstance(optimizer, FusedSGD):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights_FusedSGD, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_no_master_weights_FusedSGD, optimizer)
else:
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_no_master_weights, optimizer)
optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer)
old_add_param_group = optimizer.add_param_group
def new_add_param_group(self, new_group):
......
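All of the hooks in this file are attached with the same idiom: types.MethodType binds a free function to an existing optimizer instance so the hook receives it as self. A standalone sketch (Dummy and _hook are illustrative names, not from the diff):
import types

class Dummy:
    pass

def _hook(self):
    return id(self)

opt = Dummy()
opt._post_amp_backward = types.MethodType(_hook, opt)  # bound method on this one instance
assert opt._post_amp_backward() == id(opt)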
......@@ -6,8 +6,6 @@ from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params, maybe_print
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
from ..parallel.LARC import LARC
......@@ -89,11 +87,6 @@ def scale_loss(loss,
if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
optimizers = [optimizers]
# this is what happens when i have to support tools from different sources under the same API...
# TODO: Rewrite FusedAdam to use multi-tensor apply and the same loss scaler.
if isinstance(optimizers, FP16_Optimizer_for_fused):
loss_scale = optimizers.cur_scale
else:
loss_scaler = _amp_state.loss_scalers[loss_id]
loss_scale = loss_scaler.loss_scale()
......@@ -120,8 +113,8 @@ def scale_loss(loss,
for optimizer in optimizers:
optimizer._amp_stash.params_have_scaled_gradients = True
else:
# FusedAdam and FusedSGD will take care of unscaling as part of their step() methods.
if not isinstance(optimizers, FP16_Optimizer_for_fused):
# FusedAdam and FusedSGD may take care of unscaling as part of their step() methods.
# if not isinstance(optimizers, FP16_Optimizer_for_fused):
loss_scaler.clear_overflow_state()
for optimizer in optimizers:
optimizer._post_amp_backward(loss_scaler)
......@@ -142,10 +135,15 @@ def scale_loss(loss,
maybe_print(("Gradient overflow. Skipping step, loss scaler " +
"{} reducing loss scale to {}").format(loss_id,
loss_scaler.loss_scale()))
# TODO: I don't like the special casing for different optimizer implementations.
# Maybe skip should delegate to a method owned by the optimizers themselves.
if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in opt._amp_stash.all_fp32_from_fp16_params:
param.grad = None
if hasattr(opt, "most_recent_scale"):
opt.most_recent_scale = 1.0
opt.scale_set_by_backward = False
opt.step = opt_step
opt._amp_stash.already_patched = False
return skip_step
......
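For context, a sketch of the entry point whose bookkeeping this hunk adjusts: amp.scale_loss scales the loss, runs the _prepare/_post backward hooks around backward(), and swaps step for skip_step on overflow. The toy model and shapes are illustrative and a CUDA build of apex is assumed.
import torch
from apex import amp

model = torch.nn.Linear(4, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

loss = model(torch.randn(2, 4, device='cuda')).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()   # _prepare_amp_backward / _post_amp_backward run around this
optimizer.step()             # patched to skip_step for one call if an overflow was detected
optimizer.zero_grad()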
......@@ -16,7 +16,7 @@ def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=F
master_grad.mul_(scale)
return False
def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, check_overflow=False):
def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
# Exception handling for 18.04 compatibility
if check_overflow:
cpu_sum = float(model_grad.float().sum())
......@@ -26,9 +26,8 @@ def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, ch
# if master_grad is not model_grad: # copy_ probably internally short-circuits this
# master_grad.copy_(model_grad)
assert stashed_grad.dtype == master_grad.dtype
converted_model_grad = model_grad.to(master_grad.dtype)
stashed_grad.add_(scale, converted_model_grad)
master_grad.data = stashed_grad.data
converted_model_grad = model_grad.data.to(master_grad.dtype)
master_grad.data = a*converted_model_grad.data + b*stashed_grad.data
return False
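A quick numeric illustration of the generalized axpby above (tensors and scale are made up): with a = 1/loss_scale and b = 1.0 it reproduces the old "unscale the new grad, then add the stashed grad" behaviour.
import torch

loss_scale = 2.0**10
model_grad = torch.full((3,), 2.0**10)   # scaled grad fresh from backward
stashed = torch.full((3,), 0.25)         # already-unscaled grad stashed from an earlier backward
a, b = 1.0 / loss_scale, 1.0
master = a * model_grad + b * stashed
assert torch.allclose(master, torch.full((3,), 1.25))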
class LossScaler(object):
......@@ -92,11 +91,13 @@ class LossScaler(object):
break
# unused_scale keeps some of the old API alive for hopefully a short time.
def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False):
def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None):
if self._has_overflow:
return
scale = self._loss_scale
if scale_override is not None:
scale = scale_override
if scale == 1.0 and models_are_masters and not self.dynamic:
return
......@@ -126,7 +127,8 @@ class LossScaler(object):
model_grads,
stashed_master_grads,
master_grads,
scale):
a,
b):
for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
if model is None and stashed is None:
continue
......@@ -141,7 +143,8 @@ class LossScaler(object):
self._has_overflow = axpby_check_overflow_python(model,
stashed,
master,
1./scale,
a,
b,
self.dynamic)
if self._has_overflow and self.dynamic:
break
......@@ -149,11 +152,14 @@ class LossScaler(object):
def unscale_with_stashed(self,
model_grads,
stashed_master_grads,
master_grads):
master_grads,
scale_override=None):
if self._has_overflow:
return
scale = self._loss_scale
grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0
if scale_override is not None:
grads_have_scale, stashed_have_scale, out_scale = scale_override
if LossScaler.has_fused_kernel:
if (not LossScaler.warned_unscaling_non_fp32_grad
......@@ -167,14 +173,15 @@ class LossScaler(object):
multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
self._overflow_buf,
[model_grads, stashed_master_grads, master_grads],
1./scale,
1.0,
out_scale/grads_have_scale, # 1./scale,
out_scale/stashed_have_scale, # 1.0,
0) # check only arg 0, aka the incoming model grads, for infs
else:
self.unscale_with_stashed_python(model_grads,
stashed_master_grads,
master_grads,
scale)
out_scale/grads_have_scale,
out_scale/stashed_have_scale)
# Defer to update_scale
# If the fused kernel is available, we only need one D2H memcopy and sync.
......
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include "THC/THC.h"
#include "batch_norm.h"
#include <cuda.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
static size_t round_up_to_multiple(size_t x, int multiple) {
return ((x + multiple - 1) / multiple) * multiple;
}
// TODO: Stop manually allocating CUDA memory; allocate an ATen byte
// tensor instead.
struct Workspace {
Workspace(size_t size) : size(size), data(NULL) {
data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size);
}
Workspace(const Workspace&) = delete;
Workspace(Workspace&&) = default;
Workspace& operator=(Workspace&&) = default;
~Workspace() {
if (data) {
THCudaFree(at::globalContext().lazyInitCUDA(), data);
}
}
size_t size;
void* data;
};
// Return {y}
at::Tensor nhwc_bn_fwd_train(
const at::Tensor& x,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& minibatch_mean,
const at::Tensor& minibatch_inv_var,
const at::Tensor& ret_cta,
const float momentum,
const float epsilon,
const bool fuse_relu,
void * my_data,
void * pair_data,
void * pair_data2,
void * pair_data3,
const int bn_group,
const at::Tensor& magic_tensor,
const int occupancy,
const int grid_dim_x,
const bool coop) {
const int N = x.size(0);
const int H = x.size(1);
const int W = x.size(2);
const int C = x.size(3);
// generate a new magic number and use it for sync
int* magic = magic_tensor.data<int>();
*magic = (*magic + 1) & 0xff;
// Allocate output tensor
at::Tensor y = at::empty({N, H, W, C}, x.options());
// Create wrapper
NhwcBatchNorm *bn = new NhwcBatchNorm();
bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
nullptr,
y.data<at::Half>(),
nullptr);
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
// We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
// an allocated workspace for the others
size_t total_workspace_bytes = 0;
std::vector<size_t> workspace_offsets;
for (auto index = 3; index < workspace_bytes.size(); ++index) {
total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
workspace_offsets.push_back(total_workspace_bytes);
auto alloc_bytes = workspace_bytes[index];
total_workspace_bytes += alloc_bytes;
}
// Allocate the workspace
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(minibatch_mean.data<float>());
workspace.push_back(minibatch_inv_var.data<float>());
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[2];
void* retired_ctas = ret_cta.data<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
for (auto index = 3; index < workspace_bytes.size(); ++index) {
void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-3];
workspace.push_back(ptr);
}
bn->setWorkspacePointers(workspace, workspace_bytes);
// Don't fuse in ReLU for now at least
bn->fwd(stream, fuse_relu, my_data, pair_data, pair_data2, pair_data3, bn_group, *magic, occupancy, grid_dim_x, coop);
return y;
}
at::Tensor nhwc_bn_fwd_eval(
const at::Tensor& x,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& ret_cta,
const int bn_group,
const float momentum,
const float epsilon,
const bool fuse_relu) {
const int N = x.size(0);
const int H = x.size(1);
const int W = x.size(2);
const int C = x.size(3);
// Allocate output tensor
at::Tensor y = at::empty({N, H, W, C}, x.options());
// Create wrapper
NhwcBatchNorm *bn = new NhwcBatchNorm();
bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
nullptr,
y.data<at::Half>(),
nullptr);
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
// We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
// an allocated workspace for the others
size_t total_workspace_bytes = 0;
std::vector<size_t> workspace_offsets;
for (auto index = 3; index < workspace_bytes.size(); ++index) {
total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
workspace_offsets.push_back(total_workspace_bytes);
auto alloc_bytes = workspace_bytes[index];
total_workspace_bytes += alloc_bytes;
}
// Allocate the workspace
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(nullptr);
workspace.push_back(nullptr);
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[2];
void* retired_ctas = ret_cta.data<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
for (auto index = 3; index < workspace_bytes.size(); ++index) {
void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-3];
workspace.push_back(ptr);
}
bn->setWorkspacePointers(workspace, workspace_bytes);
// Don't fuse in ReLU for now at least
bn->fwdInference(stream, fuse_relu);
return y;
}
std::vector<at::Tensor> nhwc_bn_bwd(
const at::Tensor& x,
const at::Tensor& dy,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& minibatch_mean,
const at::Tensor& minibatch_inv_var,
const at::Tensor& ret_cta,
const float momentum,
const float epsilon,
const bool fuse_relu,
void * my_data,
void * pair_data,
void * pair_data2,
void * pair_data3,
const int bn_group,
const at::Tensor& magic_tensor,
const int occupancy,
const int grid_dim_x,
const bool coop) {
// shape
const int N = x.size(0);
const int H = x.size(1);
const int W = x.size(2);
const int C = x.size(3);
// generate a new magic number and use it for sync
int* magic = magic_tensor.data<int>();
*magic = (*magic + 1) & 0xff;
// outputs
at::Tensor x_grad, scale_grad, bias_grad;
// Allocate outputs
x_grad = at::empty_like(x);
scale_grad = at::empty_like(scale);
bias_grad = at::empty_like(bias);
// Create wrapper
NhwcBatchNorm *bn = new NhwcBatchNorm();
bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
x_grad.data<at::Half>(),
nullptr,
dy.data<at::Half>());
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
// We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
// an allocated workspace for the others
size_t total_workspace_bytes = 0;
std::vector<size_t> workspace_offsets;
for (auto index = 3; index < workspace_bytes.size(); ++index) {
total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
workspace_offsets.push_back(total_workspace_bytes);
auto alloc_bytes = workspace_bytes[index];
total_workspace_bytes += alloc_bytes;
}
// Allocate the workspace
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(minibatch_mean.data<float>());
workspace.push_back(minibatch_inv_var.data<float>());
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[2];
void* retired_ctas = ret_cta.data<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
for (auto index = 3; index < workspace_bytes.size(); ++index) {
void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-3];
workspace.push_back(ptr);
}
bn->setWorkspacePointers(workspace, workspace_bytes);
bn->dgrad(stream, fuse_relu, my_data, pair_data, pair_data2, pair_data3, bn_group, *magic, occupancy, grid_dim_x, coop);
return std::vector<at::Tensor>{x_grad, scale_grad, bias_grad};
}
int nhwc_bn_fwd_occupancy() {
int device_id=-1;
cudaGetDevice(&device_id);
//max occupancy supported by the code is 2
return NhwcBatchNorm::smem_driven_fwd_occupancy(device_id, 2);
}
int nhwc_bn_bwd_occupancy() {
int device_id=-1;
cudaGetDevice(&device_id);
//max occupancy supported by the code is 2
return NhwcBatchNorm::smem_driven_bwd_occupancy(device_id, 2);
}
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include "THC/THC.h"
#include "batch_norm_add_relu.h"
#include <cuda.h>
// FIXME: move the common stuff to a common header file
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
static size_t round_up_to_multiple(size_t x, int multiple) {
return ((x + multiple - 1) / multiple) * multiple;
}
// TODO: Stop manually allocating CUDA memory; allocate an ATen byte
// tensor instead.
struct Workspace {
Workspace(size_t size) : size(size), data(NULL) {
data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size);
}
Workspace(const Workspace&) = delete;
Workspace(Workspace&&) = default;
Workspace& operator=(Workspace&&) = default;
~Workspace() {
if (data) {
THCudaFree(at::globalContext().lazyInitCUDA(), data);
}
}
size_t size;
void* data;
};
// Return {y}
at::Tensor nhwc_bn_addrelu_fwd_train(
const at::Tensor& x,
const at::Tensor& z,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& minibatch_mean,
const at::Tensor& minibatch_inv_var,
const at::Tensor& bitmask,
const at::Tensor& ret_cta,
const float momentum,
const float epsilon,
void * my_data,
void * pair_data,
void * pair_data2,
void * pair_data3,
const int bn_group,
const at::Tensor& magic_tensor,
const int occupancy,
const int grid_dim_x,
const bool coop) {
const int N = x.size(0);
const int H = x.size(1);
const int W = x.size(2);
const int C = x.size(3);
// generate a new magic number and use it for sync
int* magic = magic_tensor.data<int>();
*magic = (*magic + 1) & 0xff;
// Allocate output tensor
at::Tensor y = at::empty({N, H, W, C}, x.options());
// Create wrapper
NhwcBatchNormAddRelu *bn = new NhwcBatchNormAddRelu();
bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
nullptr,
y.data<at::Half>(),
nullptr,
z.data<at::Half>(),
nullptr);
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
// We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
// an allocated workspace for the others
size_t total_workspace_bytes = 0;
std::vector<size_t> workspace_offsets;
for (auto index = 4; index < workspace_bytes.size(); ++index) {
total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
workspace_offsets.push_back(total_workspace_bytes);
auto alloc_bytes = workspace_bytes[index];
total_workspace_bytes += alloc_bytes;
}
// Allocate the workspace
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(minibatch_mean.data<float>());
workspace.push_back(minibatch_inv_var.data<float>());
workspace.push_back(bitmask.data<int32_t>());
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[3];
void* retired_ctas = ret_cta.data<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
for (auto index = 4; index < workspace_bytes.size(); ++index) {
void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-4];
workspace.push_back(ptr);
}
bn->setWorkspacePointers(workspace, workspace_bytes);
// Don't fuse in ReLU for now at least
bn->fwd(stream, my_data, pair_data, pair_data2, pair_data3, bn_group, *magic, occupancy, grid_dim_x, coop);
return y;
}
at::Tensor nhwc_bn_addrelu_fwd_eval(
const at::Tensor& x,
const at::Tensor& z,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& ret_cta,
const int bn_group,
const float momentum,
const float epsilon) {
const int N = x.size(0);
const int H = x.size(1);
const int W = x.size(2);
const int C = x.size(3);
// Allocate output tensor
at::Tensor y = at::empty({N, H, W, C}, x.options());
// Create wrapper
NhwcBatchNormAddRelu *bn = new NhwcBatchNormAddRelu();
bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
nullptr,
y.data<at::Half>(),
nullptr,
z.data<at::Half>(),
nullptr);
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
// We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
// an allocated workspace for the others
size_t total_workspace_bytes = 0;
std::vector<size_t> workspace_offsets;
for (auto index = 4; index < workspace_bytes.size(); ++index) {
total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
workspace_offsets.push_back(total_workspace_bytes);
auto alloc_bytes = workspace_bytes[index];
total_workspace_bytes += alloc_bytes;
}
// Allocate the workspace
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(nullptr);
workspace.push_back(nullptr);
workspace.push_back(nullptr);
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[3];
void* retired_ctas = ret_cta.data<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
for (auto index = 4; index < workspace_bytes.size(); ++index) {
void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-4];
workspace.push_back(ptr);
}
bn->setWorkspacePointers(workspace, workspace_bytes);
// Don't fuse in ReLU for now at least
bn->fwdInference(stream);
return y;
}
std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
const at::Tensor& x,
const at::Tensor& dy,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& minibatch_mean,
const at::Tensor& minibatch_inv_var,
const at::Tensor& bitmask,
const at::Tensor& ret_cta,
const float momentum,
const float epsilon,
void * my_data,
void * pair_data,
void * pair_data2,
void * pair_data3,
const int bn_group,
const at::Tensor& magic_tensor,
const int occupancy,
const int grid_dim_x,
const bool coop) {
// shape
const int N = x.size(0);
const int H = x.size(1);
const int W = x.size(2);
const int C = x.size(3);
// generate a new magic number and use it for sync
int* magic = magic_tensor.data<int>();
*magic = (*magic + 1) & 0xff;
// outputs
at::Tensor x_grad, z_grad, scale_grad, bias_grad;
// Allocate outputs
x_grad = at::empty_like(x);
z_grad = at::empty_like(x);
scale_grad = at::empty_like(scale);
bias_grad = at::empty_like(bias);
// Create wrapper
NhwcBatchNormAddRelu *bn = new NhwcBatchNormAddRelu();
bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
x_grad.data<at::Half>(),
nullptr,
dy.data<at::Half>(),
nullptr,
z_grad.data<at::Half>());
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
// We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
// an allocated workspace for the others
size_t total_workspace_bytes = 0;
std::vector<size_t> workspace_offsets;
for (auto index = 4; index < workspace_bytes.size(); ++index) {
total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
workspace_offsets.push_back(total_workspace_bytes);
auto alloc_bytes = workspace_bytes[index];
total_workspace_bytes += alloc_bytes;
}
// Allocate the workspace
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(minibatch_mean.data<float>());
workspace.push_back(minibatch_inv_var.data<float>());
workspace.push_back(bitmask.data<int32_t>());
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[3];
void* retired_ctas = ret_cta.data<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
for (auto index = 4; index < workspace_bytes.size(); ++index) {
void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-4];
workspace.push_back(ptr);
}
bn->setWorkspacePointers(workspace, workspace_bytes);
bn->dgrad(stream, my_data, pair_data, pair_data2, pair_data3, bn_group, *magic, occupancy, grid_dim_x, coop);
return std::vector<at::Tensor>{x_grad, z_grad, scale_grad, bias_grad};
}
int nhwc_bn_addrelu_fwd_occupancy() {
int device_id=-1;
cudaGetDevice(&device_id);
//max occupancy supported by the code is 2
return NhwcBatchNormAddRelu::smem_driven_fwd_occupancy(device_id, 2);
}
int nhwc_bn_addrelu_bwd_occupancy() {
int device_id=-1;
cudaGetDevice(&device_id);
//max occupancy supported by the code is 2
return NhwcBatchNormAddRelu::smem_driven_bwd_occupancy(device_id, 2);
}
#include <ATen/cuda/CUDAContext.h>
#ifndef CUDA_UTILS_H
#define CUDA_UTILS_H
namespace at {
namespace cuda {
namespace utils {
static inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
return getDeviceProperties(device_id)->sharedMemPerMultiprocessor;
}
}
}
}
#endif
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/ArrayRef.h>
#include <ATen/ScalarType.h>
#include "ATen/Scalar.h"
#ifndef VERSION_GE_1_1
#include "ATen/Type.h"
#endif
#include "ATen/Tensor.h"
#include "ATen/Storage.h"
#include "ATen/Generator.h"
namespace py = pybind11;
int64_t get_buffer_size(
const int bn_sync_steps);
void* get_data_ptr(
const at::Tensor& data);
void* get_remote_data_ptr(
const at::Tensor& handle,
const int64_t offset);
void close_remote_data(
const at::Tensor& handle);
at::Tensor nhwc_bn_fwd_train(
const at::Tensor& x,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& minibatch_mean,
const at::Tensor& minibatch_inv_var,
const at::Tensor& ret_cta,
const float momentum,
const float epsilon,
const bool fuse_relu,
void* my_data,
void* pair_data,
void* pair_data2,
void* pair_data3,
const int bn_group,
const at::Tensor& magic_tensor,
const int occupancy,
const int grid_dim_x,
const bool coop);
at::Tensor nhwc_bn_fwd_eval(
const at::Tensor& x,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& ret_cta,
const int bn_group,
const float momentum,
const float epsilon,
const bool fuse_relu);
std::vector<at::Tensor> nhwc_bn_bwd(
const at::Tensor& x,
const at::Tensor& dy,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& minibatch_mean,
const at::Tensor& minibatch_inv_var,
const at::Tensor& ret_cta,
const float momentum,
const float epsilon,
const bool fuse_relu,
void* my_data,
void* pair_data,
void* pair_data2,
void* pair_data3,
const int bn_group,
const at::Tensor& magic_tensor,
const int occupancy,
const int grid_dim_x,
const bool coop);
at::Tensor nhwc_bn_addrelu_fwd_train(
const at::Tensor& x,
const at::Tensor& z,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& minibatch_mean,
const at::Tensor& minibatch_inv_var,
const at::Tensor& bitmask,
const at::Tensor& ret_cta,
const float momentum,
const float epsilon,
void* my_data,
void* pair_data,
void* pair_data2,
void* pair_data3,
const int bn_group,
const at::Tensor& magic_tensor,
const int occupancy,
const int grid_dim_x,
const bool coop);
at::Tensor nhwc_bn_addrelu_fwd_eval(
const at::Tensor& x,
const at::Tensor& z,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& ret_cta,
const int bn_group,
const float momentum,
const float epsilon);
std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
const at::Tensor& x,
const at::Tensor& dy,
const at::Tensor& scale,
const at::Tensor& bias,
const at::Tensor& running_mean,
const at::Tensor& running_inv_var,
const at::Tensor& minibatch_mean,
const at::Tensor& minibatch_inv_var,
const at::Tensor& bitmask,
const at::Tensor& ret_cta,
const float momentum,
const float epsilon,
void* my_data,
void* pair_data,
void* pair_data2,
void* pair_data3,
const int bn_group,
const at::Tensor& magic_tensor,
const int occupancy,
const int grid_dim_x,
const bool coop);
int nhwc_bn_fwd_occupancy();
int nhwc_bn_bwd_occupancy();
int nhwc_bn_addrelu_fwd_occupancy();
int nhwc_bn_addrelu_bwd_occupancy();
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("get_buffer_size", &get_buffer_size, "get_buffer_size");
m.def("get_data_ptr", &get_data_ptr, "get_data_ptr");
m.def("get_remote_data_ptr", &get_remote_data_ptr, "get_remote_data_ptr");
m.def("close_remote_data", &close_remote_data, "close_remote_data");
m.def("bn_fwd_nhwc", &nhwc_bn_fwd_train, "bn_fwd_nhwc");
m.def("bn_fwd_eval_nhwc", &nhwc_bn_fwd_eval, "bn_fwd_eval_nhwc");
m.def("bn_bwd_nhwc", &nhwc_bn_bwd, "bn_bwd_nhwc");
m.def("bn_fwd_nhwc_occupancy", &nhwc_bn_fwd_occupancy, "bn_fwd_nhwc_occupancy");
m.def("bn_bwd_nhwc_occupancy", &nhwc_bn_bwd_occupancy, "bn_bwd_nhwc_occupancy");
m.def("bn_addrelu_fwd_nhwc", &nhwc_bn_addrelu_fwd_train, "bn_addrelu_fwd_nhwc");
m.def("bn_addrelu_fwd_eval_nhwc", &nhwc_bn_addrelu_fwd_eval, "bn_addrelu_fwd_eval_nhwc");
m.def("bn_addrelu_bwd_nhwc", &nhwc_bn_addrelu_bwd, "bn_addrelu_bwd_nhwc");
m.def("bn_addrelu_fwd_nhwc_occupancy", &nhwc_bn_addrelu_fwd_occupancy, "bn_addrelu_fwd_nhwc_occupancy");
m.def("bn_addrelu_bwd_nhwc_occupancy", &nhwc_bn_addrelu_bwd_occupancy, "bn_addrelu_bwd_nhwc_occupancy");
}
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include "THC/THC.h"
#include <cuda.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
template<>
struct std::hash<cudaIpcMemHandle_t> {
size_t operator() (const cudaIpcMemHandle_t& handle) const {
size_t hash = 0;
uint8_t* ptr = (uint8_t*)&handle;
assert(sizeof(uint8_t) == 1);
for (int i=0; i<sizeof(cudaIpcMemHandle_t); i++) {
hash += *ptr;
ptr++;
}
return hash;
}
};
template<>
struct std::equal_to<cudaIpcMemHandle_t> {
bool operator() (const cudaIpcMemHandle_t &lhs,
const cudaIpcMemHandle_t &rhs) const {
return (std::memcmp((void*) &lhs,
(void*) &rhs,
sizeof(cudaIpcMemHandle_t)) == 0);
}
};
namespace {
namespace gpuipc {
//from: src/operator/nn/cudnn/nhwc_batch_norm_kernel.h
// The number of threads per pixel.
const int THREADS_PER_PIXEL = 16;
// The number of elements per ldg.
const int ELEMENTS_PER_LDG = 4;
// The number of reducing ops, each uses its own space : mean, var, dscale, dbias
const int REDUCE_OPS = 4;
// Maximum block.y supported - limited due to buffer allocation
const int MAX_BLOCK_Y = 256;
const int MAX_OFFSET = REDUCE_OPS*MAX_BLOCK_Y;
const int BYTES_PER_ELEM = 4;
// Buffer size per sync step
const int SINGLE_SYNC_BUFFER_BYTES = MAX_OFFSET*THREADS_PER_PIXEL*2*ELEMENTS_PER_LDG*BYTES_PER_ELEM;
};
class IpcMemHandleRegistry {
public:
void* getPtr(const cudaIpcMemHandle_t& handle, int64_t offset) {
if (registry_.count(handle) == 0) {
registry_.insert(std::make_pair(handle, RegistryEntry()));
registry_[handle].dev_ptr = ipcOpenMem(handle);
}
registry_[handle].ref_count++;
return (((uint8_t*)registry_[handle].dev_ptr) + offset);
}
void releasePtr(const cudaIpcMemHandle_t& handle) {
if (registry_.count(handle) == 0) {
}
if (--registry_[handle].ref_count == 0) {
ipcCloseMem(registry_[handle].dev_ptr);
registry_.erase(handle);
}
}
struct RegistryEntry {
void* dev_ptr;
int ref_count;
RegistryEntry() : dev_ptr(NULL) , ref_count(0) {}
};
protected:
std::unordered_map<cudaIpcMemHandle_t, RegistryEntry> registry_;
void* ipcOpenMem(const cudaIpcMemHandle_t& handle) {
void *data;
cudaIpcOpenMemHandle(&data, handle, cudaIpcMemLazyEnablePeerAccess);
cudaCheckErrors("ipc init");
return data;
}
void ipcCloseMem(void* dev_ptr) {
cudaIpcCloseMemHandle(dev_ptr);
cudaCheckErrors("ipc close");
}
};
}
static IpcMemHandleRegistry ipc_mem_registry;
int64_t get_buffer_size(const int bn_sync_steps) {
return bn_sync_steps * gpuipc::SINGLE_SYNC_BUFFER_BYTES;
}
void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) {
cudaIpcMemHandle_t my_handle;
memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
return ipc_mem_registry.getPtr(my_handle, offset);
}
void close_remote_data(const at::Tensor& handle) {
cudaIpcMemHandle_t my_handle;
memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
ipc_mem_registry.releasePtr(my_handle);
}
void* get_data_ptr(
const at::Tensor& data) {
return data.data<uint8_t>();
}
#include <torch/extension.h>
// CUDA forward declarations
std::vector<at::Tensor> softmax_xentropy_cuda(
const at::Tensor &input,
const at::Tensor &labels,
const float smoothing,
const bool half_to_float);
at::Tensor softmax_xentropy_backward_cuda(
const at::Tensor &grad_loss,
const at::Tensor &logits,
const at::Tensor &max_log_sum_exp,
const at::Tensor &labels,
const float smoothing);
// C++ interface
#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
std::vector<at::Tensor> softmax_xentropy_forward(
const at::Tensor &input,
const at::Tensor &labels,
const float smoothing,
const bool half_to_float) {
CHECK_CUDA(input);
CHECK_INPUT(labels);
return softmax_xentropy_cuda(input, labels, smoothing, half_to_float);
}
at::Tensor softmax_xentropy_backward(
const at::Tensor &grad_loss,
const at::Tensor &logits,
const at::Tensor &max_log_sum_exp,
const at::Tensor &labels,
const float smoothing) {
CHECK_CUDA(grad_loss);
CHECK_CUDA(logits);
CHECK_INPUT(max_log_sum_exp);
CHECK_INPUT(labels);
return softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, smoothing);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)");
m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)");
}
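Sketch of how this extension is consumed from Python, mirroring the unit test further below (assumes apex was built with --xentropy and a CUDA device is available):
import torch
from apex.contrib import xentropy

logits = torch.randn(32, 1000, dtype=torch.half, device='cuda', requires_grad=True)
labels = torch.randint(0, 1000, (32,), device='cuda')
# args: logits, labels, smoothing, padding_idx, half_to_float
losses = xentropy.SoftmaxCrossEntropyLoss.apply(logits, labels, 0.1, 0, True)
losses.sum().backward()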
try:
import torch
import bnp
from .batch_norm import BatchNorm2d_NHWC
del torch
del bnp
del batch_norm
except ImportError as err:
print("apex was installed without --bnp flag, contrib.groupbn is not available")
import torch
import numpy as np
from torch.nn.modules.batchnorm import _BatchNorm
import bnp
class bn_NHWC_impl(torch.autograd.Function):
@staticmethod
def forward(ctx, x, s, b, rm, riv, mini_m, mini_riv, ret_cta, mom, epsilon, fuse_relu, is_train, bn_group, my_data, pair_data, magic, pair_data2, pair_data3, fwd_occup, fwd_grid_x, bwd_occup, bwd_grid_x, multi_stream):
if is_train:
ctx.save_for_backward(x, s, b, rm, riv, mini_m, mini_riv)
ctx.epsilon = epsilon
ctx.momentum = mom
ctx.ret_cta = ret_cta
ctx.fuse_relu = fuse_relu
ctx.my_data = my_data
ctx.pair_data = pair_data
ctx.magic = magic
ctx.pair_data2 = pair_data2
ctx.pair_data3 = pair_data3
ctx.bn_group = bn_group
ctx.bwd_occup = bwd_occup
ctx.bwd_grid_x = bwd_grid_x
ctx.multi_stream = multi_stream
res = bnp.bn_fwd_nhwc(x, s, b, rm, riv, mini_m, mini_riv, ret_cta, mom, epsilon, fuse_relu, my_data, pair_data, pair_data2, pair_data3, bn_group, magic, fwd_occup, fwd_grid_x, multi_stream)
return res
else:
return bnp.bn_fwd_eval_nhwc(x, s, b, rm, riv, ret_cta, bn_group, mom, epsilon, fuse_relu)
@staticmethod
def backward(ctx, grad_y):
x, s, b, rm, riv, mini_m, mini_riv = ctx.saved_variables
epsilon = ctx.epsilon
mom = ctx.momentum
ret_cta = ctx.ret_cta
fuse_relu = ctx.fuse_relu
my_data = ctx.my_data
pair_data = ctx.pair_data
magic = ctx.magic
pair_data2 = ctx.pair_data2
pair_data3 = ctx.pair_data3
bn_group = ctx.bn_group
bwd_occup = ctx.bwd_occup
bwd_grid_x = ctx.bwd_grid_x
multi_stream = ctx.multi_stream
dx, dscale, dbias = bnp.bn_bwd_nhwc(x, grad_y, s, b, rm, riv, mini_m, mini_riv, ret_cta, mom, epsilon, fuse_relu, my_data, pair_data, pair_data2, pair_data3, bn_group, magic, bwd_occup, bwd_grid_x, multi_stream)
return dx, dscale, dbias, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
class bn_addrelu_NHWC_impl(torch.autograd.Function):
@staticmethod
def forward(ctx, x, z, s, b, rm, riv, mini_m, mini_riv, grid_dim_y, ret_cta, mom, epsilon, is_train, bn_group, my_data, pair_data, magic, pair_data2, pair_data3, fwd_occup, fwd_grid_x, bwd_occup, bwd_grid_x, multi_stream):
if is_train:
bitmask = torch.cuda.IntTensor(((x.numel()+31)//32) * 2 * grid_dim_y)
ctx.save_for_backward(x, s, b, rm, riv, mini_m, mini_riv, bitmask)
ctx.epsilon = epsilon
ctx.momentum = mom
ctx.ret_cta = ret_cta
ctx.my_data = my_data
ctx.pair_data = pair_data
ctx.magic = magic
ctx.pair_data2 = pair_data2
ctx.pair_data3 = pair_data3
ctx.bn_group = bn_group
ctx.bwd_occup = bwd_occup
ctx.bwd_grid_x = bwd_grid_x
ctx.multi_stream = multi_stream
res = bnp.bn_addrelu_fwd_nhwc(x, z, s, b, rm, riv, mini_m, mini_riv, bitmask, ret_cta, mom, epsilon, my_data, pair_data, pair_data2, pair_data3, bn_group, magic, fwd_occup, fwd_grid_x, multi_stream)
return res
else:
return bnp.bn_addrelu_fwd_eval_nhwc(x, z, s, b, rm, riv, ret_cta, bn_group, mom, epsilon)
@staticmethod
def backward(ctx, grad_y):
x, s, b, rm, riv, mini_m, mini_riv, bitmask = ctx.saved_variables
epsilon = ctx.epsilon
mom = ctx.momentum
ret_cta = ctx.ret_cta
my_data = ctx.my_data
pair_data = ctx.pair_data
magic = ctx.magic
pair_data2 = ctx.pair_data2
pair_data3 = ctx.pair_data3
bn_group = ctx.bn_group
bwd_occup = ctx.bwd_occup
bwd_grid_x = ctx.bwd_grid_x
multi_stream = ctx.multi_stream
dx, dz, dscale, dbias = bnp.bn_addrelu_bwd_nhwc(x, grad_y, s, b, rm, riv, mini_m, mini_riv, bitmask, ret_cta, mom, epsilon, my_data, pair_data, pair_data2, pair_data3, bn_group, magic, bwd_occup, bwd_grid_x, multi_stream)
return dx, dz, dscale, dbias, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
class BatchNorm2d_NHWC(_BatchNorm):
# if using BatchNorm2d_NHWC simultaneously with multiple streams set multi_stream to True
def __init__(self, num_features, fuse_relu=False, bn_group=1, max_cta_per_sm=2, cta_launch_margin=12, multi_stream=False):
super(BatchNorm2d_NHWC, self).__init__(num_features)
self.fuse_relu = fuse_relu
self.multi_stream = multi_stream
self.minibatch_mean = torch.cuda.FloatTensor(num_features)
self.minibatch_riv = torch.cuda.FloatTensor(num_features)
#default to distributed bn disabled
self.bn_group = bn_group
self.max_cta_per_sm = max_cta_per_sm #used only in training fwd and bwd
self.cta_launch_margin = cta_launch_margin #used only in training fwd and bwd
self.my_data = None
self.pair_data = None
self.pair_data2 = None
self.pair_data3 = None
self.local_rank = 0
self.magic = torch.IntTensor([0])
#calculate cta per sm occupancies
assert(max_cta_per_sm>0) # won't be able to do much with 0 CTAs :)
self.fwd_occupancy = min(bnp.bn_fwd_nhwc_occupancy(), max_cta_per_sm)
self.bwd_occupancy = min(bnp.bn_bwd_nhwc_occupancy(), max_cta_per_sm)
self.addrelu_fwd_occupancy = min(bnp.bn_addrelu_fwd_nhwc_occupancy(), max_cta_per_sm)
self.addrelu_bwd_occupancy = min(bnp.bn_addrelu_bwd_nhwc_occupancy(), max_cta_per_sm)
#calculate grid dimensions based on occupancy numbers
mp_count = torch.cuda.get_device_properties(None).multi_processor_count
self.fwd_grid_dim_x = max(mp_count*self.fwd_occupancy - cta_launch_margin , 1)
self.bwd_grid_dim_x = max(mp_count*self.bwd_occupancy - cta_launch_margin , 1)
self.addrelu_fwd_grid_dim_x = max(mp_count*self.addrelu_fwd_occupancy - cta_launch_margin , 1)
self.addrelu_bwd_grid_dim_x = max(mp_count*self.addrelu_bwd_occupancy - cta_launch_margin , 1)
self.grid_dim_y = (num_features + 63) // 64
# allocate scratch space used by implementation
# TODO: this scratch space should not be exposed to user code. It only needs one-time
# initialization and the same buffer could be reused in future iterations. It is currently
# allocated here, rather than requested from the caching allocator each iteration, to avoid
# unnecessary re-initialization in future iterations.
self.ret_cta = torch.cuda.ByteTensor(8192).fill_(0)
#FIXME: turn pair handles into an array
if bn_group>1:
local_rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
assert(world_size >= bn_group)
assert(world_size % bn_group == 0)
bn_sync_steps = 1
if (bn_group==4):
bn_sync_steps = 2
if (bn_group==8):
bn_sync_steps = 3
self.ipc_buffer = torch.cuda.ByteTensor(bnp.get_buffer_size(bn_sync_steps))
self.my_data = bnp.get_data_ptr(self.ipc_buffer)
# we are walking on very thin ice here by utilizing internal `_share_cuda_()`
self.storage = self.ipc_buffer.storage()
self.share_cuda = self.storage._share_cuda_()
internal_cuda_mem = self.share_cuda
# internal_cuda_mem[1]: ipc_mem_handle
my_handle = torch.cuda.ByteTensor(np.frombuffer(internal_cuda_mem[1], dtype=np.uint8))
# internal_cuda_mem[3]: offset
my_offset = torch.cuda.IntTensor([internal_cuda_mem[3]])
handles_all = torch.empty(world_size, my_handle.size(0), dtype=my_handle.dtype, device=my_handle.device)
handles_l = list(handles_all.unbind(0))
torch.distributed.all_gather(handles_l, my_handle)
offsets_all = torch.empty(world_size, my_offset.size(0), dtype=my_offset.dtype, device=my_offset.device)
offsets_l = list(offsets_all.unbind(0))
torch.distributed.all_gather(offsets_l, my_offset)
#whom do I actually care about? that would be local_rank XOR 1
self.pair_handle = handles_l[local_rank ^ 1].cpu().contiguous()
pair_offset = offsets_l[local_rank ^ 1].cpu()
self.pair_data = bnp.get_remote_data_ptr(self.pair_handle, pair_offset)
if bn_group>2:
self.pair_handle2 = handles_l[local_rank ^ 2].cpu().contiguous()
pair_offset2 = offsets_l[local_rank ^ 2].cpu()
self.pair_data2 = bnp.get_remote_data_ptr(self.pair_handle2, pair_offset2)
if bn_group>4:
self.pair_handle3 = handles_l[local_rank ^ 4].cpu().contiguous()
pair_offset3 = offsets_l[local_rank ^ 4].cpu()
self.pair_data3 = bnp.get_remote_data_ptr(self.pair_handle3, pair_offset3)
#FIXME: get magic value into C code and eliminate from here
self.magic = torch.IntTensor([2])
self.local_rank = local_rank
def forward(self, x, z=None):
if z is not None:
assert(self.fuse_relu==True)
return bn_addrelu_NHWC_impl.apply(x, z,
self.weight, self.bias,
self.running_mean, self.running_var,
self.minibatch_mean, self.minibatch_riv, self.grid_dim_y, self.ret_cta,
self.momentum,
self.eps, self.training, self.bn_group, self.my_data, self.pair_data, (self.magic), self.pair_data2, self.pair_data3,
self.addrelu_fwd_occupancy, self.addrelu_fwd_grid_dim_x,
self.addrelu_bwd_occupancy, self.addrelu_bwd_grid_dim_x,
self.multi_stream)
else:
return bn_NHWC_impl.apply(x,
self.weight, self.bias,
self.running_mean, self.running_var,
self.minibatch_mean, self.minibatch_riv, self.ret_cta,
self.momentum,
self.eps, self.fuse_relu, self.training, self.bn_group, self.my_data, self.pair_data, (self.magic), self.pair_data2, self.pair_data3,
self.fwd_occupancy, self.fwd_grid_dim_x,
self.bwd_occupancy, self.bwd_grid_dim_x,
self.multi_stream)
def __del__(self):
if self.bn_group>1:
bnp.close_remote_data(self.pair_handle)
if self.bn_group>2:
bnp.close_remote_data(self.pair_handle2)
if self.bn_group>4:
bnp.close_remote_data(self.pair_handle3)
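The handle exchange in __init__ pairs ranks by XOR on local_rank, and __del__ above releases the same handles. A standalone sketch of who exchanges buffers with whom for bn_group = 4 (pure arithmetic, expected output in the comments):
bn_group = 4
for rank in range(bn_group):
    peers = [rank ^ 1, rank ^ 2] + ([rank ^ 4] if bn_group > 4 else [])
    print(rank, '->', peers)
# 0 -> [1, 2]
# 1 -> [0, 3]
# 2 -> [3, 0]
# 3 -> [2, 1]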
import torch
from apex.contrib import xentropy as label_smoothing
import unittest
import warnings
import random
import numpy as np
import time
def label_smoothing_raw(x, target, padding_idx, smoothing):
logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)
non_pad_mask = (target != padding_idx)
nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
nll_loss = nll_loss.squeeze(1)[non_pad_mask]
smooth_loss = -logprobs.mean(dim=-1)[non_pad_mask]
loss = (1.0 - smoothing) * nll_loss + smoothing * smooth_loss
return loss
def label_smoothing_opt_1(x, target, padding_idx, smoothing):
logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)
pad_mask = (target == padding_idx)
ll_loss = logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
smooth_loss = logprobs.mean(dim=-1)
loss = (smoothing - 1.0) * ll_loss - smoothing * smooth_loss
loss.masked_fill_(pad_mask, 0)
return loss
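A quick CPU-only equivalence check between the two reference implementations above (shapes are illustrative): label_smoothing_opt_1 rewrites the same loss and zeroes padded positions instead of dropping them, so the summed losses match.
x = torch.randn(6, 11)
target = torch.randint(1, 11, (6,))
target[0] = 0   # make one position the padding index
raw = label_smoothing_raw(x, target, 0, 0.1).sum()
opt = label_smoothing_opt_1(x, target, 0, 0.1).sum()
assert torch.allclose(raw, opt)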
class LabelSmoothingTest(unittest.TestCase):
def setUp(self, seed=1234):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Set pytorch print precision
torch.set_printoptions(precision=10)
def gen_test_inputs(self, N, T, H, smoothing, padding_idx):
logits = torch.randn((N*T, H), dtype=torch.half, device='cuda',
requires_grad=True)
labels = torch.randint(0, H, [N*T], device='cuda')
for i in random.sample(range(N*T), N*T//6):
labels[i] = padding_idx
half_to_float = (logits.dtype == torch.half)
return logits, labels, half_to_float
def print_max_diff_elem(self, ref, tst):
ref, tst = ref.flatten(), tst.flatten()
diff = (ref - tst).abs().max()
idx = (ref - tst).abs().argmax()
print("Max atol idx: {}, diff: {:.6f}, ref: {:.6f}, tst: {:.6f}".format(
idx, diff, ref[idx], tst[idx]))
def test_label_smoothing_function(self):
# Set label smoothing configuration
smoothing, padding_idx = 0.1, 0
N, T, H = 128, 74, 32320
iters = 10
loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply
for i in range(iters):
logits, labels, half_to_float = self.gen_test_inputs(
N, T, H, smoothing, padding_idx)
# Run original softmax cross entropy with label smoothing
logits.grad = None
losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
loss = losses.sum()
loss.backward()
ref_loss = loss.clone().detach()
ref_grad = logits.grad.clone().detach()
# Run optimized softmax cross entropy with label smoothing
logits.grad = None
losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
loss = losses.sum()
loss.backward()
val_loss = loss.clone().detach()
val_grad = logits.grad.clone().detach()
# Validate
self.print_max_diff_elem(ref_grad, val_grad)
self.assertTrue(torch.allclose(ref_loss, val_loss, atol=1e-5, rtol=1e-5))
self.assertTrue(torch.allclose(ref_grad, val_grad, atol=1e-5, rtol=1e-5))
def test_label_smoothing_perf(self):
# Set label smoothing configuration
smoothing, padding_idx = 0.1, 0
N, T, H = 128, 74, 32320
iters = 1000
loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply
print()
logits, labels, half_to_float = self.gen_test_inputs(
N, T, H, smoothing, padding_idx)
# Run original softmax cross entropy with label smoothing
torch.cuda.synchronize()
ts = time.time()
for i in range(iters):
logits.grad = None
losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
loss = losses.sum() / N
loss.backward()
torch.cuda.synchronize()
print("Raw time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
time.time() - ts, iters, logits.grad.norm()))
# Run optimized softmax cross entropy with label smoothing
torch.cuda.synchronize()
ts = time.time()
for i in range(iters):
logits.grad = None
losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
loss = losses.sum() / N
loss.backward()
torch.cuda.synchronize()
print("Opt time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
time.time() - ts, iters, logits.grad.norm()))
if __name__ == '__main__':
unittest.main()
try:
import torch
import xentropy_cuda
from .softmax_xentropy import SoftmaxCrossEntropyLoss
del torch
del xentropy_cuda
del softmax_xentropy
except ImportError as err:
print("apex was installed without --xentropy flag, contrib.xentropy is not available")