"csrc/git@developer.sourcefind.cn:OpenDAS/torch-cluster.git" did not exist on "c67425b070ec8fc2d6e4757cd74cf4b171d48902"
Commit 889d1712 authored by Michael Carilli

New API tentatively works on resnet50, ready for stress testing.

parent fad78c16
from . import fp16_utils
from . import parallel
from . import amp
from . import fp16_utils
# For optimizers and normalization there is no Python fallback.
# Absence of cuda backend is a hard error.
......
from .amp import init, half_function, float_function, promote_function,\
register_half_function, register_float_function, register_promote_function,\
register
from .multi_tensor_apply import MultiTensorApply
register_half_function, register_float_function, register_promote_function
from .handle import scale_loss
from .frontend import register
from .multi_tensor_apply import MultiTensorApply, multi_tensor_applier
# This is a "header object" that allows different amp modules to communicate.
# I'm a C++ guy, not a Python guy. I chose this approach because it seemed the most C++-like.
# But apparently it's ok:
# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
class AmpState(object):
pass
# Attribute stash. Could also just stash things as global module attributes.
_amp_state = AmpState()
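A minimal, self-contained sketch of the shared-module-state pattern the comment above points to: every module that imports the same instance sees the same attribute stash. The module and attribute names below are illustrative only, not part of apex.

```python
# Hypothetical illustration of the "header object" pattern.
class _SharedState(object):
    pass

_shared = _SharedState()

def producer():
    # e.g. the frontend stashing a handle for other modules to find later
    _shared.handle = "loss-scaling handle"

def consumer():
    # e.g. scale_loss() reading the stashed handle
    return getattr(_shared, "handle", None)

producer()
assert consumer() == "loss-scaling handle"
```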
from . import compat, rnn_compat, utils, wrap
from .handle import AmpHandle, NoOpHandle
from .lists import functional_overrides, torch_overrides, tensor_overrides
from ._amp_state import _amp_state
from .frontend import *
import functools
......@@ -170,4 +171,7 @@ def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False,
wrap.err_if_any_half(functional_overrides.MODULE, fn, handle, err_msg)
_DECORATOR_HANDLE = handle
_amp_state.handle = handle
return handle
import torch
from .initialize import initialize
from .initialize import _initialize
from ._amp_state import _amp_state
class Properties(object):
......@@ -10,6 +11,7 @@ class Properties(object):
"""
def __init__(self):
self.options = {
"enabled" : False,
"opt_level" : None,
"cast_model_type" : None,
"cast_torch_functions" : False,
......@@ -18,6 +20,7 @@ class Properties(object):
"loss_scale" : 1.0,
"flatten_model_params" : False,
"flatten_master_params" : False,
"fused_optimizer" : False,
"enable_ddp_interop" : False}
"""
......@@ -45,7 +48,7 @@ class Properties(object):
def __setattr__(self, name, value):
if "options" in self.__dict__:
if name in self.options:
print("setting {}".format(name))
print("setting {} {}".format(name, value))
self.options[name] = value
else:
super(Properties, self).__setattr__(name, value)
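A hedged sketch of the attribute-stash behavior in the `__setattr__` shown above: known option names are routed into the `options` dict, everything else becomes a normal attribute. The option names below are a small placeholder subset, not the full Amp set.

```python
class PropsSketch(object):
    def __init__(self):
        # First assignment falls through to object.__setattr__, creating the dict.
        self.options = {"enabled": False, "opt_level": None, "loss_scale": 1.0}

    def __getattr__(self, name):
        # Only called when normal lookup fails; serve known options from the dict.
        if "options" in self.__dict__ and name in self.options:
            return self.options[name]
        raise AttributeError(name)

    def __setattr__(self, name, value):
        if "options" in self.__dict__ and name in self.options:
            self.options[name] = value
        else:
            super(PropsSketch, self).__setattr__(name, value)

p = PropsSketch()
p.loss_scale = "dynamic"              # routed into p.options
assert p.options["loss_scale"] == "dynamic" and p.loss_scale == "dynamic"
```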
......@@ -63,7 +66,8 @@ class O3:
"If not, try other optimization levels."
def __call__(self, properties):
properties.opt_level = "O3",
properties.enabled = True
properties.opt_level = "O3"
properties.cast_model_type = torch.float16
properties.cast_torch_functions = False
properties.cast_batchnorm = False
......@@ -71,6 +75,7 @@ class O3:
properties.loss_scale = 1.0
properties.flatten_model_params = False
properties.flatten_master_params = False
properties.fused_optimizer = False
properties.enable_ddp_interop = False
return properties # modified in place so this isn't really necessary
......@@ -86,7 +91,8 @@ class O2:
"Master weights can also improve convergence and stability."
def __call__(self, properties):
properties.opt_level = "O2",
properties.enabled = True
properties.opt_level = "O2"
properties.cast_model_type = torch.float16
properties.cast_torch_functions = False
properties.cast_batchnorm = torch.float32
......@@ -94,6 +100,7 @@ class O2:
properties.loss_scale = 128.0
properties.flatten_model_params = False
properties.flatten_master_params = False
properties.fused_optimizer = False
properties.enable_ddp_interop = False
return properties # modified in place so this isn't really necessary
......@@ -108,7 +115,8 @@ class O1:
"trying mixed precision training for the first time."
def __call__(self, properties):
properties.opt_level = "O1",
properties.enabled = True
properties.opt_level = "O1"
properties.cast_model_type = False
properties.cast_torch_functions = True
properties.cast_batchnorm = False
......@@ -116,6 +124,7 @@ class O1:
properties.loss_scale = "dynamic"
properties.flatten_model_params = False
properties.flatten_master_params = False
properties.fused_optimizer = False
properties.enable_ddp_interop = False
return properties # modified in place so this isn't really necessary
......@@ -128,7 +137,8 @@ class O0:
"may still be requested.\n"
def __call__(self, properties):
properties.opt_level = "O0",
properties.enabled = True
properties.opt_level = "O0"
properties.cast_model_type = torch.float32
properties.cast_torch_functions = False
properties.cast_batchnorm = False
......@@ -136,6 +146,7 @@ class O0:
properties.loss_scale = 1.0
properties.flatten_model_params = False
properties.flatten_master_params = False
properties.fused_optimizer = False
properties.enable_ddp_interop = False
return properties # modified in place so this isn't really necessary
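The four O-level classes above follow one pattern: a callable descriptor that stamps its defaults onto a Properties instance. Below is a simplified, hypothetical sketch of how an `opt_levels` registry consumes such descriptors; the real `opt_levels` dict is defined outside the hunks shown here.

```python
import types

class O1Sketch(object):
    brief = "O1: cast eligible Torch functions to fp16, dynamic loss scaling."
    def __call__(self, properties):
        # Stamp this level's defaults onto the incoming options object.
        properties.opt_level = "O1"
        properties.cast_model_type = False
        properties.cast_torch_functions = True
        properties.loss_scale = "dynamic"
        return properties

opt_levels = {"O1": O1Sketch()}   # the real table maps "O0" through "O3"

def defaults_for(opt_level):
    if opt_level not in opt_levels:
        raise RuntimeError("Unexpected optimization level {}. "
                           "Options are 'O0', 'O1', 'O2', 'O3'.".format(opt_level))
    return opt_levels[opt_level](types.SimpleNamespace())

props = defaults_for("O1")
assert props.loss_scale == "dynamic"
```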
......@@ -162,47 +173,49 @@ def check_params_fp32(model):
# allow user to directly pass Properties struct as well?
def register(enabled=False,
optimizers=None,
models=None,
opt_level=None,
cast_model_type=None,
cast_torch_functions=None,
cast_batchnorm=None,
master_weights=None,
loss_scale=None,
flatten_model_params=None,
flatten_master_params=None,
enable_ddp_interop=None):
def register(models, optimizers, enabled=True, opt_level=None, **kwargs):
"""
Expected kwargs:
opt_level=None,
cast_model_type=None,
cast_torch_functions=None,
cast_batchnorm=None,
master_weights=None,
loss_scale=None,
flatten_model_params=None,
flatten_master_params=None,
enable_ddp_interop=None):
"""
if not enabled:
return
return models, optimizers
if opt_level not in opt_levels:
raise RuntimeError("Unexpected optimization level. Options are 'O0', 'O1', 'O2', 'O3'.")
raise RuntimeError(
"Unexpected optimization level {}. ".format(opt_level) +
"Options are 'O0', 'O1', 'O2', 'O3'.")
else:
amp.opt_properties = opt_levels[opt_level](Properties())
print("Selected optimization level {}", opt_levels[opt_level].brief)
_amp_state.opt_properties = opt_levels[opt_level](Properties())
print("Selected optimization level {}".format(opt_levels[opt_level].brief))
print("Defaults for this optimization level are:")
for k, v in amp.opt_properties.options:
print("{:20} : {}", k, v)
for model in models:
check_params_fp32(model)
print(_amp_state.opt_properties.options)
for k, v in _amp_state.opt_properties.options.items():
print("{:20} : {}".format(k, v))
print("Processing user overrides (additional kwargs that are not None)...")
for k, v in kwargs:
for k, v in kwargs.items():
if k not in _amp_state.opt_properties.options:
raise RuntimeError("Unexpected kwarg {}".format(k))
if v is not None:
setattr(amp.opt_properties, k, v)
setattr(_amp_state.opt_properties, k, v)
print("After processing overrides, optimization options are:")
for k, v in amp.opt_properties.options:
print("{:20} : {}", k, v)
for k, v in _amp_state.opt_properties.options.items():
print("{:20} : {}".format(k, v))
return initialize(optimizers, models)
return _initialize(models, optimizers, _amp_state.opt_properties)
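Putting the pieces together, here is a hypothetical call into the new frontend as it stands in this commit. The model and optimizer are placeholders, and since the commit message calls the API tentative, the exact signature may still change.

```python
import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda()     # fp32 model; no manual .half()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# opt_level picks the defaults; any non-None kwarg overrides them.
model, optimizer = amp.register(model, optimizer,
                                opt_level="O1",
                                loss_scale="dynamic")
```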
def check_option_consistency(enabled=False,
def check_option_consistency(enabled=True,
opt_level=None,
cast_model_type=None,
cast_torch_functions=None,
......@@ -230,13 +243,15 @@ def check_option_consistency(enabled=False,
print("Selected optimization level {}", opt_levels[opt_level].brief)
print("Defaults for this optimization level are:")
for k, v in opt_properties.options:
print("{:20} : {}", k, v)
print("{:20} : {}".format(k, v))
print("Processing user overrides (additional kwargs that are not None)...")
for k, v in kwargs:
if k not in amp_state.opt_properties.options:
raise RuntimeError("Unexpected kwarg {}".format(k))
if v is not None:
setattr(opt_properties, k, v)
print("After processing overrides, optimization options are:")
for k, v in opt_properties.options:
print("{:20} : {}", k, v)
print("{:20} : {}".format(k, v))
......@@ -5,6 +5,64 @@ import warnings
from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler, iter_params
from ._amp_state import _amp_state
from ..fp16_utils import FP16_Optimizer
# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
@contextlib.contextmanager
def scale_loss(loss,
optimizer,
model=None,
delay_unscale=False):
if not _amp_state.opt_properties.enabled:
yield loss
return
if optimizer.loss_scaler is None:
raise RuntimeError("optimizer passed to scale_loss does not have a loss_scaler.")
loss_scale = optimizer.loss_scaler.loss_scale()
if ((not _amp_state.opt_properties.master_weights)
and (not optimizer.loss_scaler.dynamic)
and loss_scale == 1.0):
yield loss
# Needing to drop the cache here as well is an ugly gotcha.
# But for now I think it's necessary to short-circuit.
# Probably ok to skip this if not delay_unscale
if _amp_state.opt_properties.cast_torch_functions:
_amp_state.handle._clear_cache()
return
yield loss*loss_scale
# this isn't pretty but it unifies things. Once I deprecate the old API entirely,
# I will have freedom to clean this up. Maybe instead of wrapping optimizers,
# I can simply construct a set of attributes (e.g. master params) and assign them
# directly to optimizer instances.
if not delay_unscale:
if isinstance(optimizer, FP16_Optimizer):
optimizer.update_master_grads()
else:
optimizer.loss_scaler.unscale(
iter_params(optimizer.param_groups),
iter_params(optimizer.param_groups),
loss_scale)
# If overflow_check_on_cpu is False, should_skip will always be False.
should_skip = optimizer.loss_scaler.update_scale()
if should_skip:
optimizer_step = optimizer.step
def skip_step():
logger = logging.getLogger('apex.amp')
logger.warning('Gradient overflow, skipping update')
optimizer.step = optimizer_step
optimizer.step = skip_step
# Probably ok to skip this if not delay_unscale
if _amp_state.opt_properties.cast_torch_functions:
_amp_state.handle._clear_cache()
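A hedged usage sketch of the scale_loss context manager defined above, repeating the placeholder setup from the register() sketch earlier. Unscaling and the overflow check run on exit unless delay_unscale is set.

```python
import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = amp.register(model, optimizer, opt_level="O1")

loss = model(torch.randn(4, 10, device="cuda")).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()      # gradients are produced at the scaled magnitude
optimizer.step()                # unscaling / overflow check ran on __exit__
optimizer.zero_grad()
```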
class AmpHandle(object):
def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=False):
......@@ -43,10 +101,11 @@ class AmpHandle(object):
loss_scale = self._default_scaler.loss_scale()
yield loss * loss_scale
should_skip = self._default_scaler.unscale_and_update(
self._default_scaler.unscale(
iter_params(optimizer.param_groups),
iter_params(optimizer.param_groups),
loss_scale)
should_skip = self._default_scaler.update_scale()
if should_skip:
optimizer_step = optimizer.step
def skip_step():
......@@ -108,5 +167,8 @@ class NoOpHandle(object):
def verbose(self):
return False
def _clear_cache(self):
pass
def _deactivate(self):
pass
......@@ -2,6 +2,25 @@ import torch
from torch._six import container_abcs, string_classes
import functools
from apex.fp16_utils import convert_network
from ._amp_state import _amp_state
from .scaler import LossScaler
from ..fp16_utils import FP16_Optimizer
def check_params_fp32(model):
for name, param in model.named_parameters():
if param.is_floating_point() and param.type() != "torch.cuda.FloatTensor":
print("Warning: Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.register, you do not need to call .half() on your model\n"
"before passing it, no matter what optimization level you choose.".format(
name, param.type()))
for name, buf in model.named_buffers():
if buf.is_floating_point() and buf.type() != "torch.cuda.FloatTensor":
print("Warning: Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.register, you do not need to call .half() on your model\n"
"before passing it, no matter what optimization level you choose.".format(
name, buf.type()))
def to_type(dtype, t):
......@@ -11,7 +30,7 @@ def to_type(dtype, t):
print("Warning: input data requires grad. Since input data is not a model parameter,\n"
"its gradients will not be properly allreduced by DDP.")
if t.is_floating_point():
return t.half()
return t.to(dtype)
return t
......@@ -29,13 +48,47 @@ def applier(value, fn):
return value
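A self-contained sketch of the to_type/applier casting pattern used below in patch_forward: a `functools.partial` caster is walked over arbitrarily nested arguments. This is a simplified re-implementation for illustration; the real applier handles additional container types.

```python
import functools
import torch

def to_type_sketch(dtype, t):
    # Cast only floating-point tensors; leave ints, bools, and non-tensors alone.
    if torch.is_tensor(t) and t.is_floating_point():
        return t.to(dtype)
    return t

def applier_sketch(value, fn):
    if torch.is_tensor(value):
        return fn(value)
    if isinstance(value, dict):
        return {k: applier_sketch(v, fn) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return type(value)(applier_sketch(v, fn) for v in value)
    return value

caster = functools.partial(to_type_sketch, torch.float16)
args = ([torch.randn(2), 3], {"x": torch.randn(2)})
half_args = applier_sketch(args, caster)
assert half_args[0][0].dtype == torch.float16
```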
def initialize(optimizers, models, properties):
def _initialize(models, optimizers, properties):
from apex.parallel import DistributedDataParallel as apex_DDP
from .amp import init as amp_init
if isinstance(optimizers, torch.optim.Optimizer):
optimizers_was_list = False
optimizers = [optimizers]
elif isinstance(optimizers, list):
optimizers_was_list = True
else:
raise TypeError("optimizers must be either a single optimizer or a list of optimizers.")
if isinstance(models, torch.nn.Module):
models_was_list = False
models = [models]
elif isinstance(models, list):
models_was_list = True
else:
raise TypeError("models must be either a single model or a list of models.")
for model in models:
parallel_type = None
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
parallel_type = "torch.nn.parallel.DistributedDataParallel"
if isinstance(model, apex_DDP):
parallel_type = "apex.parallel.DistributedDataParallel"
if isinstance(model, torch.nn.parallel.DataParallel):
parallel_type = "torch.nn.parallel.DataParallel"
if parallel_type is not None:
raise RuntimeError("Incoming model is an instance of {}. ".format(parallel_type) +
"Parallel wrappers should only be applied AFTER the model(s) have been "
"returned from amp.register.")
for model in models:
check_params_fp32(model)
# Stash master weights before casting the model.
# if properties.master_weights:
if properties.cast_model_type is not None:
if properties.cast_batchnorm is not None:
if properties.cast_model_type:
if properties.cast_batchnorm:
for model in models:
convert_network(model, properties.cast_model_type)
else:
......@@ -50,7 +103,7 @@ def initialize(optimizers, models, properties):
return old_fwd(*applier(args, caster),
**applier(kwargs, caster))
return new_fwd
model.forward = patch_forward(model.forward)
# State dict trick to recast any preexisting per-param state tensors
......@@ -60,11 +113,23 @@ def initialize(optimizers, models, properties):
if properties.master_weights:
for i, optimizer in enumerate(optimizers):
if properties.loss_scale == "dynamic":
optimizers[i] = FP16_Optimizer(optimizer[i], dynamic_loss_scale=True)
optimizers[i] = FP16_Optimizer(optimizers[i], dynamic_loss_scale=True)
else:
optimizers[i] = FP16_Optimizer(optimizer[i], static_loss_scale=properties.loss_scale)
optimizers[i] = FP16_Optimizer(optimizers[i], static_loss_scale=properties.loss_scale)
else:
for optimizer in optimizers:
optimizer.loss_scaler = LossScaler(properties.loss_scale)
if properties.cast_torch_functions:
handle = amp.init() # the handle is also globally accessible as amp._DECORATOR_HANDLE
handle = amp_init(loss_scale=properties.loss_scale)
return optimizers, models
if optimizers_was_list:
if models_was_list:
return models, optimizers
else:
return models[0], optimizers
else:
if models_was_list:
return models, optimizers[0]
else:
return models[0], optimizers[0]
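The was_list bookkeeping above preserves the caller's container shape. A hedged illustration with placeholder modules, assuming amp.register forwards to _initialize as shown:

```python
import torch
from apex import amp

m1 = torch.nn.Linear(2, 2).cuda()
m2 = torch.nn.Linear(2, 2).cuda()
opt = torch.optim.SGD(list(m1.parameters()) + list(m2.parameters()), lr=0.1)

models, opt = amp.register([m1, m2], opt, opt_level="O2")
assert isinstance(models, list) and len(models) == 2    # list in, list out
```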
import torch
from amp_C import prep_multi_tensor_launch
class MultiTensorApply(object):
available = False
warned = False
def __init__(self, max_blocks, max_tensors, max_depth, chunk_size):
self.chunk_size = chunk_size
self.reallocate(max_blocks, max_tensors, max_depth)
try:
import amp_C
MultiTensorApply.available = True
MultiTensorApply.prep_multi_tensor_launch = amp_C.prep_multi_tensor_launch
self.chunk_size = chunk_size
self.reallocate(max_blocks, max_tensors, max_depth)
except ImportError as err:
MultiTensorApply.available = False
MultiTensorApply.import_err = err
def check_avail(self):
if MultiTensorApply.available == False:
raise RuntimeError(
"Attempted to call MultiTensorApply method, but MultiTensorApply "
"is not available, possibly because Apex was installed without "
"--cpp_ext --cuda_ext. Original import error message:",
MultiTensorApply.import_err)
def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
self.assign_blocks(tensor_lists)
self.check_avail()
assert len(tensor_lists) > 0, "len(tensor_lists) = {}".format(len(tensor_lists))
len0 = len(tensor_lists[0])
assert len0 > 0, "len(tensor_lists[0]) = {}".format(len0)
for i, l in enumerate(tensor_lists):
assert len(tensor_lists[i]) == len0,\
"len(tensor_lists[{}]) = {}, len(tensor_lists[0]) = {}".format(
i, len(tensor_lists[i]), len0)
self.assign_blocks(tensor_lists)
# print(self.gpu_block_to_tensor)
# print(self.gpu_block_to_chunk)
# print(self.gpu_tensor_sizes)
......@@ -16,11 +42,11 @@ class MultiTensorApply(object):
return op(self.nblocks,
noop_flag_buffer,
self.cpu_tensor_addresses,
self.gpu_block_to_tensor,
self.gpu_block_to_tensor,
self.gpu_block_to_chunk,
self.gpu_tensor_sizes,
self.gpu_tensor_addresses,
self.chunk_size,
self.chunk_size,
tensor_lists,
*args)
......@@ -30,6 +56,8 @@ class MultiTensorApply(object):
# print(self.gpu_tensor_addresses)
def assign_blocks(self, tensor_lists):
self.check_avail()
needs_reallocate = False
# Currently, this loop appears prohibitively expensive.
......@@ -38,7 +66,7 @@ class MultiTensorApply(object):
# list0 = tensor_lists[0]
# self.nblocks = 0
# for t, tensor in enumerate(list0):
# blocks_this_tensor = (tensor.numel() +
# blocks_this_tensor = (tensor.numel() +
# self.chunk_size - 1)//self.chunk_size
# if not needs_reallocate:
# self.cpu_tensor_sizes[t] = tensor.numel()
......@@ -49,20 +77,21 @@ class MultiTensorApply(object):
# self.cpu_block_to_tensor[self.nblocks] = t
# self.cpu_block_to_chunk[self.nblocks] = chunk
# self.nblocks += 1
needs_reallocate, self.nblocks = prep_multi_tensor_launch(self.cpu_block_to_tensor,
self.cpu_block_to_chunk,
self.cpu_tensor_sizes,
self.gpu_block_to_tensor,
self.gpu_block_to_chunk,
self.gpu_tensor_sizes,
self.chunk_size,
self.max_depth,
self.max_tensors,
self.max_blocks,
tensor_lists)
needs_reallocate, self.nblocks = MultiTensorApply.prep_multi_tensor_launch(
self.cpu_block_to_tensor,
self.cpu_block_to_chunk,
self.cpu_tensor_sizes,
self.gpu_block_to_tensor,
self.gpu_block_to_chunk,
self.gpu_tensor_sizes,
self.chunk_size,
self.max_depth,
self.max_tensors,
self.max_blocks,
tensor_lists)
torch.cuda.nvtx.range_pop()
print(self.nblocks)
# print(self.nblocks)
if self.nblocks > self.max_blocks:
self.max_blocks = self.nblocks
......@@ -73,23 +102,26 @@ class MultiTensorApply(object):
if needs_reallocate:
self.reallocate(self.max_blocks, self.max_tensors, self.max_depth)
needs_reallocate, self.nblocks = prep_multi_tensor_launch(self.cpu_block_to_tensor,
self.cpu_block_to_chunk,
self.cpu_tensor_sizes,
self.gpu_block_to_tensor,
self.gpu_block_to_chunk,
self.gpu_tensor_sizes,
self.chunk_size,
self.max_depth,
self.max_tensors,
self.max_blocks,
tensor_lists)
needs_reallocate, self.nblocks = MultiTensorApply.prep_multi_tensor_launch(
self.cpu_block_to_tensor,
self.cpu_block_to_chunk,
self.cpu_tensor_sizes,
self.gpu_block_to_tensor,
self.gpu_block_to_chunk,
self.gpu_tensor_sizes,
self.chunk_size,
self.max_depth,
self.max_tensors,
self.max_blocks,
tensor_lists)
assert needs_reallocate == 0, "Should not need reallocate on second attempt."
assert self.nblocks <= self.max_blocks, "Should not need to increase blocks again."
def reallocate(self, max_blocks, max_tensors, max_depth):
self.check_avail()
self.max_blocks = max_blocks
self.max_tensors = max_tensors
self.max_tensors = max_tensors
self.max_depth = max_depth
self.cpu_block_to_tensor = torch.IntTensor(max_blocks).pin_memory()
......@@ -101,3 +133,5 @@ class MultiTensorApply(object):
self.gpu_block_to_chunk = torch.cuda.IntTensor(max_blocks)
self.gpu_tensor_sizes = torch.cuda.IntTensor(max_tensors)
self.gpu_tensor_addresses = torch.cuda.LongTensor(max_depth, max_tensors)
multi_tensor_applier = MultiTensorApply(1000, 100, 4, 2048)
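A hedged sketch of invoking the module-level multi_tensor_applier instance created above, mirroring the LossScaler.unscale call later in this commit. It requires the amp_C extension built with --cpp_ext --cuda_ext, and the gradient tensors are placeholders.

```python
import torch
import amp_C
from apex.amp import multi_tensor_applier

overflow_buf = torch.cuda.IntTensor([0])
model_grads = [torch.randn(1000, device="cuda"), torch.randn(3000, device="cuda")]
master_grads = [torch.empty_like(g) for g in model_grads]

# op, no-op flag buffer, list of tensor lists, then op-specific args (the scale).
multi_tensor_applier(amp_C.multi_tensor_unscale,
                     overflow_buf,
                     [model_grads, master_grads],
                     1. / 128.0)
```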
......@@ -37,10 +37,11 @@ class OptimWrapper(object):
loss_scale = self._cur_loss_scaler().loss_scale()
yield loss * loss_scale
self._skip_next[self._loss_idx] = self._cur_loss_scaler().unscale_and_update(
self._cur_loss_scaler().unscale(
iter_params(self._optimizer.param_groups),
iter_params(self._optimizer.param_groups),
loss_scale)
self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
self._loss_idx += 1
if len(cached_grads) > 0:
......
import torch
import logging
from .multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state
# from apex_C import scale_check_overflow
def scale_check_overflow_python(model_grad, scale, master_grad):
# Exception handling for 18.04 compatibility
try:
cpu_sum = float(d_grads.float().sum())
cpu_sum = float(model_grad.float().sum())
except RuntimeError as instance:
if "value cannot be converted" not in instance.args[0]:
raise
......@@ -16,9 +18,10 @@ def scale_check_overflow_python(model_grad, scale, master_grad):
return True
if master_grad is not model_grad:
master_grad.copy_(model_grad)
master_grad.mul_(scale)
if scale != 1.0:
master_grad.mul_(scale)
return False
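A quick, hedged check of the Python fallback above, assuming the lines elided between the hunks perform the usual inf/nan test on cpu_sum:

```python
import torch

g_model = torch.tensor([1.0, float("nan")])
g_master = torch.empty_like(g_model)
assert scale_check_overflow_python(g_model, 1. / 128, g_master)      # overflow reported

g_model = torch.tensor([128.0, 256.0])
assert not scale_check_overflow_python(g_model, 1. / 128, g_master)  # copied and rescaled
assert torch.allclose(g_master, torch.tensor([1.0, 2.0]))
```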
class LossScaler(object):
warned_no_fused_kernel = False
warned_fp16_grad = False
......@@ -39,48 +42,88 @@ class LossScaler(object):
self._scale_seq_len = scale_window
self._unskipped = 0
self._has_overflow = False
try:
self._overflow_buf = torch.cuda.IntTensor([0])
if multi_tensor_applier.available:
import amp_C
LossScaler.has_fused_kernel = True
LossScaler.scale_check_overflow_cuda = amp_C.scale_check_overflow
self._overflow_buf = torch.cuda.IntTensor([0])
except ImportError as err:
LossScaler.has_fused_kernel = multi_tensor_applier.available
LossScaler.multi_tensor_unscale_cuda = amp_C.multi_tensor_unscale
else:
if not LossScaler.warned_no_fused_kernel:
print("Warning: Amp fused downscale kernel is unavailable, possibly because apex "
"was installed without --cuda_ext. Using Python fallback. ImportError was: ",
err)
print("Warning: multi_tensor_applier fused downscale kernel is unavailable, "
"possibly because apex was installed without --cuda_ext --cpp_ext. "
"Using Python fallback. Original ImportError was: ",
multi_tensor_applier.import_err)
LossScaler.has_fused_kernel = False
LossScaler.warned_no_fused_kernel = True
def loss_scale(self):
return self._loss_scale
def unscale_and_update(self, model_params, master_params, scale):
if LossScaler.has_fused_kernel:
self._overflow_buf.zero_()
def unscale_grads_python(self, model_grads, master_grads, scale):
for model, master in zip(model_grads, master_grads):
if model is not None:
if (master.type() != "torch.cuda.FloatTensor"
and not LossScaler.warned_fp16_grad):
logger = logging.getLogger("apex.amp")
logger.warning(
"Attempting to downscale {} grads. ".format(master.type()) +
"Downscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
LossScaler.warned_fp16_grad = True
self._has_overflow = scale_check_overflow_python(
model,
1./scale,
master)
if self._has_overflow and self.dynamic:
break
def unscale(self, model_params, master_params, scale):
self._has_overflow = False
for model, master in zip(model_params, master_params):
if model.grad is not None:
if LossScaler.has_fused_kernel and master.grad.data.type() == "torch.cuda.FloatTensor":
LossScaler.scale_check_overflow_cuda(model.grad.data,
1./scale,
self._overflow_buf,
master.grad.data)
# Lots of defensive list processing going on here. Much less efficient than
# consuming the iterator directly. Need to examine Python overhead.
model_master_params = [(model, master) for model, master
in zip(model_params, master_params)] # some of these may be None
# Sync the None-ness of model and master params.
all_same = True
for model, master in model_master_params:
if model.grad is None and master.grad is not None:
master.grad = None
if model.grad is not None and master.grad is None:
master.grad = torch.empty_like(master)
if model.grad is not master.grad:
all_same = False
model_grads = [mmp[0].grad.data for mmp in model_master_params if mmp[0].grad is not None]
master_grads = [mmp[1].grad.data for mmp in model_master_params if mmp[1].grad is not None]
if LossScaler.has_fused_kernel:
# The master grads should never be fp16. The kernel can't handle that, so bail out
# and print a warning. This is overly conservative, and maybe we do want to enable
# fast downscaling of fp16 grads eventually.
if any(grad.type() == "torch.cuda.HalfTensor" for grad in master_grads):
self.unscale_grads_python(model_grads, master_grads, scale)
else:
# This is inefficient if opt_level is O1 and loss scale is 1.0. But to elide
# the launch, I would need to make sure the model grads are the master grads.
# The O(N) checks are proliferating...
self._overflow_buf.zero_()
# handle case of opt_level O1 and loss_scale 1.0. There's also some
# special-cased yields in scale_loss to potentially short-circuit earlier.
if scale == 1.0 and all_same and not self.dynamic:
return
else:
if (master.grad.data.type() != "torch.cuda.FloatTensor"
and not LossScaler.warned_fp16_grad):
logger = logging.getLogger("apex.amp")
logger.warning(
"Attempting to downscale {} grads. ".format(master.grad.data.type()) +
"Downscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
LossScaler.warned_fp16_grad = True
self._has_overflow = scale_check_overflow_python(model.grad.data,
1./scale,
master.grad.data)
if self._has_overflow and self.dynamic:
break
multi_tensor_applier(
LossScaler.multi_tensor_unscale_cuda,
self._overflow_buf,
[model_grads, master_grads],
1./scale)
else:
self.unscale_grads_python(model_grads, master_grads, scale)
# Break into multiple param groups so unscale() can be called more than once before updating.
def update_scale(self):
# If the fused kernel is available, we only need one D2H memcopy and sync.
if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
self._has_overflow = self._overflow_buf.item()
......
......@@ -14,3 +14,5 @@ from .fp16util import (
from .fp16_optimizer import FP16_Optimizer
from .loss_scaler import LossScaler, DynamicLossScaler
test = 1
......@@ -39,7 +39,7 @@ class FP16_Optimizer(object):
init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`.
static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate.
dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option.
dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used.
dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`LossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`LossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`LossScaler`'s defaults will be used.
verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling.
``init_optimizer`` is expected to have been constructed in the ordinary way.
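A hedged usage sketch matching the constructor documentation above: the classic FP16_Optimizer flow where the user halves the model and the wrapper maintains fp32 master params. The model and data below are placeholders.

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(10, 10).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

x = torch.randn(4, 10, device="cuda").half()
loss = model(x).float().sum()
optimizer.zero_grad()
optimizer.backward(loss)   # scales the loss, backprops, updates fp32 master grads
optimizer.step()           # steps on master params, copies results back to fp16
```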
......@@ -154,6 +154,18 @@ class FP16_Optimizer(object):
self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
self.fp32_from_fp32_groups.append(fp32_params_this_group)
self.all_fp16_params = []
for group in self.fp16_groups:
self.all_fp16_params += group
self.all_fp32_from_fp16_params = []
for group in self.fp32_from_fp16_groups:
self.all_fp32_from_fp16_params += group
self.all_fp32_from_fp32_params = []
for group in self.fp32_from_fp32_groups:
self.all_fp32_from_fp32_params += group
# Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
self.optimizer.load_state_dict(self.optimizer.state_dict())
# alternative way to cast per-param state tensors:
......@@ -210,35 +222,36 @@ class FP16_Optimizer(object):
param.grad.detach_() # as in torch.optim.optimizer.zero_grad()
param.grad.zero_()
def _check_overflow(self):
params = []
for group in self.fp16_groups:
for param in group:
params.append(param)
for group in self.fp32_from_fp32_groups:
for param in group:
params.append(param)
self.overflow = self.loss_scaler.has_overflow(params)
# Should not be used anymore.
# def _check_overflow(self):
# params = []
# for group in self.fp16_groups:
# for param in group:
# params.append(param)
# for group in self.fp32_from_fp32_groups:
# for param in group:
# params.append(param)
# self.overflow = self.loss_scaler.has_overflow(params)
def _update_scale(self, has_overflow=False):
self.loss_scaler.update_scale(has_overflow)
# def _update_scale(self, has_overflow=False):
# self.loss_scaler.update_scale(has_overflow)
def _master_params_to_model_params(self):
for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
master_params_to_model_params(fp16_group, fp32_from_fp16_group)
# To consider: Integrate distributed with this wrapper by registering a hook on each variable
# To consider: Integrate distributed with this wrapper by registering a hook on each variable
# that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream.
def _model_grads_to_master_grads(self):
for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)
# def _model_grads_to_master_grads(self):
# for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
# model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)
def _downscale_master(self):
if self.loss_scale != 1.0:
for group in self.optimizer.param_groups:
for param in group['params']:
if param.grad is not None:
param.grad.data.mul_(1./self.loss_scale)
# def _downscale_master(self):
# if self.loss_scale != 1.0:
# for group in self.optimizer.param_groups:
# for param in group['params']:
# if param.grad is not None:
# param.grad.data.mul_(1./self.loss_scale)
def clip_master_grads(self, max_norm, norm_type=2):
"""
......@@ -366,12 +379,15 @@ class FP16_Optimizer(object):
http://pytorch.org/docs/master/optim.html#optimizer-step-closure
"""
scale = self.loss_scaler.loss_scale
self._update_scale(self.overflow)
scale = self.loss_scaler.loss_scale()
# To consider: Should this be in step(), or update_master_grads? It works either way,
# but I should make it consistent with the Amp control flow, which updates the scale
# during backward context manager exit.
# self._update_scale(self.overflow)
if self.overflow:
print("OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}"
.format(scale, self.loss_scale))
print("OVERFLOW! Skipping step, reducing loss scale to {}".format(
self.loss_scaler.loss_scale()))
return
if closure is not None:
......@@ -409,10 +425,10 @@ class FP16_Optimizer(object):
# closure() and return the loss.
temp_loss = closure()
while(self.overflow):
scale = self.loss_scaler.loss_scale
self._update_scale(self.overflow)
print("OVERFLOW within closure! Skipping step. Attempted loss scale: {}, "
"reducing to {}".format(scale, self.loss_scale))
scale = self.loss_scaler.loss_scale()
# self._update_scale(self.overflow) # now done at the end of backward
print("OVERFLOW within closure! Skipping step, reducing loss scale to {}".format(
self.loss_scaler.loss_scale()))
temp_loss = closure()
return temp_loss
......@@ -480,7 +496,8 @@ class FP16_Optimizer(object):
# a loss scale that works. After you find a loss scale that works, do a final dummy
# backward pass with retain_graph=False to tear down the graph. Doing this would avoid
# discarding the iteration, but probably wouldn't improve overall efficiency.
loss.float()*loss_scaler.loss_scale().backward(retain_graph=retain_graph)
scaled_loss = loss.float()*self.loss_scaler.loss_scale()
scaled_loss.backward(retain_graph=retain_graph)
if update_master_grads:
self.update_master_grads()
......@@ -491,11 +508,24 @@ class FP16_Optimizer(object):
updated by the optimizer. :attr:`update_master_grads` only needs to be called if
``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
"""
if self.dynamic_loss_scale:
self._check_overflow()
if self.overflow: return
self._model_grads_to_master_grads()
self._downscale_master()
# if self.dynamic_loss_scale:
# self._check_overflow()
# if self.overflow: return
# self._model_grads_to_master_grads()
# self._downscale_master()
# Use the one-shot multi-tensor apply kernel
if len(self.all_fp16_params) > 0:
self.loss_scaler.unscale(
self.all_fp16_params,
self.all_fp32_from_fp16_params,
self.loss_scaler.loss_scale())
if len(self.all_fp32_from_fp32_params) > 0:
self.loss_scaler.unscale(
self.all_fp32_from_fp32_params,
self.all_fp32_from_fp32_params,
self.loss_scaler.loss_scale())
self.overflow = self.loss_scaler.update_scale()
def inspect_master_grad_data(self):
"""
......@@ -533,10 +563,10 @@ class FP16_Optimizer(object):
# Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
def _get_loss_scale(self):
return self.loss_scaler.loss_scale
return self.loss_scaler.loss_scale()
def _set_loss_scale(self, value):
self.loss_scaler.cur_scale = value
self.loss_scaler._loss_scale = value
loss_scale = property(_get_loss_scale, _set_loss_scale)
......
......@@ -52,7 +52,7 @@ struct UnscaleFunctor
{
incoming_vals[ii] = 0;
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n)
if(i < n && i < chunk_size)
incoming_vals[ii] = static_cast<float>(in[i]);
}
......@@ -60,7 +60,7 @@ struct UnscaleFunctor
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n)
if(i < n && i < chunk_size)
if(isfinite(incoming_vals[ii]))
out[i] = incoming_vals[ii]*scale;
else
......@@ -85,6 +85,8 @@ void multi_tensor_unscale_cuda(
{
using namespace at;
AT_CHECK(nblocks > 0, "nblocks is not > 0");
int addresses_x = gpu_tensor_addresses.size(1);
// <.< >.> i don't see any cops. i'm going to access the pointers directly.
......
......@@ -76,7 +76,7 @@ class TestCache(unittest.TestCase):
param.grad = None
loss = model(self.x).sum()
self.handle._default_scaler._loss_scale = 1.0
self.handle._default_scaler._loss_scale = 4.0
with self.handle.scale_loss(loss, dummy_optimizer) as scaled_loss:
scaled_loss.backward()
......
......@@ -51,6 +51,7 @@ class TestFP16Optimizer(unittest.TestCase):
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_loss_scaling(self):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
......