Commit 59e992da authored by Michael Carilli

Stashing to test on the cluster

parent 40555b3a
@@ -2,13 +2,19 @@
 # I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like.
 # But apparently it's ok:
 # http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
+import os
+import torch

 class AmpState(object):
     def __init__(self):
         self.hard_override=False
+        self.verbosity=1

 # Attribute stash. Could also just stash things as global module attributes.
 _amp_state = AmpState()

 def warn_or_err(msg):
     if _amp_state.hard_override:
         print("Warning: " + msg)
@@ -18,11 +24,30 @@ def warn_or_err(msg):
 #            + " If you're sure you know what you're doing, supply " +
 #            "hard_override=True to amp.initialize.")

+distributed = False
+if 'WORLD_SIZE' in os.environ:
+    distributed = int(os.environ['WORLD_SIZE']) > 1
+
+def maybe_print(msg, rank0=False):
+    if _amp_state.verbosity > 0:
+        if rank0:
+            if distributed:
+                if torch.distributed.get_rank() == 0:
+                    print(msg)
+            else:
+                print(msg)
+        else:
+            print(msg)
+
 # def iter_params(param_groups):
 #     for group in param_groups:
 #         for p in group['params']:
 #             yield p

 def master_params(optimizer):
     """
     Generator expression that iterates over the params owned by ``optimizer``.
...
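For context, a minimal usage sketch of the new maybe_print helper added above (not part of the diff; it assumes apex is importable and uses the apex.amp._amp_state module path implied by the relative imports in the other files of this commit):

from apex.amp._amp_state import _amp_state, maybe_print

_amp_state.verbosity = 1                            # default; 0 silences all Amp output
maybe_print("printed on every rank")                # unconditional (subject to verbosity)
maybe_print("printed on rank 0 only", rank0=True)   # gated by torch.distributed rank
                                                    # when WORLD_SIZE > 1 is set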
 import torch
 from ._initialize import _initialize
-from ._amp_state import _amp_state, warn_or_err
+from ._amp_state import _amp_state, warn_or_err, maybe_print

 class Properties(object):
@@ -199,7 +199,8 @@ def initialize(
     patch_torch_functions=None,
     keep_batchnorm_fp32=None,
     master_weights=None,
-    loss_scale=None
+    loss_scale=None,
+    verbosity=1,
     ):
     """
     Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
@@ -219,7 +220,7 @@ def initialize(
         optimizers (torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
         enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script
             should run as if Amp were not present.
-        opt_level(str, required): Pure or mixed precision optimization level. Accepted values are
+        opt_level (str, required): Pure or mixed precision optimization level. Accepted values are
             "O0", "O1", "O2", and "O3", explained in detail above.
         cast_model_type (``torch.dtype``, optional, default=None): Optional property override, see
             above.
@@ -227,8 +228,9 @@ def initialize(
         keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
             passed as a string, must be the string "True" or "False".
         master_weights (bool, optional, default=None): Optional property override.
-        loss_scale(float or str, default=None): Optional property override. If passed as a string,
+        loss_scale (float or str, default=None): Optional property override. If passed as a string,
             must be a string representing a number, e.g., "128.0", or the string "dynamic".
+        verbosity (int, default=1): Set to 0 to suppress Amp-related output.

     Returns:
         Model(s) and optimizer(s) modified according to the ``opt_level``.
@@ -266,8 +268,10 @@ def initialize(
     .. _`Imagenet example`:
         https://github.com/NVIDIA/apex/tree/master/examples/imagenet
     """
+    _amp_state.opt_properties = Properties()
+    _amp_state.opt_properties.verbosity = verbosity
+
     if not enabled:
-        _amp_state.opt_properties = Properties()
         return models, optimizers

     if opt_level not in opt_levels:
@@ -275,16 +279,16 @@ def initialize(
             "Unexpected optimization level {}. ".format(opt_level) +
             "Options are 'O0', 'O1', 'O2', 'O3'.")
     else:
-        _amp_state.opt_properties = opt_levels[opt_level](Properties())
-        print("Selected optimization level {}".format(opt_levels[opt_level].brief))
-        print("Defaults for this optimization level are:")
-        print(_amp_state.opt_properties.options)
+        _amp_state.opt_properties = opt_levels[opt_level](_amp_state.opt_properties)
+        maybe_print("Selected optimization level {}".format(opt_levels[opt_level].brief), True)
+        maybe_print("Defaults for this optimization level are:", True)
+        maybe_print(_amp_state.opt_properties.options, True)
         for k, v in _amp_state.opt_properties.options.items():
-            print("{:22} : {}".format(k, v))
+            maybe_print("{:22} : {}".format(k, v), True)

-    print("Processing user overrides (additional kwargs that are not None)...")
-    # I chose to have the keyword arguments listed directly in the argument list, so I
-    # can't use kwargs.items() here.
+    maybe_print("Processing user overrides (additional kwargs that are not None)...", True)
+    # I chose to have the keyword arguments listed directly in the argument list,
+    # instead of **kwargs, so I can't use kwargs.items() here.
     if enabled is not None:
         _amp_state.opt_properties.enabled = enabled
     if opt_level is not None:
@@ -300,9 +304,9 @@ def initialize(
     if loss_scale is not None:
         _amp_state.opt_properties.loss_scale = loss_scale

-    print("After processing overrides, optimization options are:")
+    maybe_print("After processing overrides, optimization options are:", True)
     for k, v in _amp_state.opt_properties.options.items():
-        print("{:22} : {}".format(k, v))
+        maybe_print("{:22} : {}".format(k, v), True)

     return _initialize(models, optimizers, _amp_state.opt_properties)
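For reference, a hedged usage sketch of the new verbosity keyword (the model and optimizer below are placeholders; only amp.initialize and its keyword arguments come from the diff above):

import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# verbosity=0 suppresses the "Selected optimization level ..." banner and the
# per-option printouts that now go through maybe_print.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)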
...
 import contextlib
-import logging
 import warnings
 import torch

 from . import utils
 from .opt import OptimWrapper
 from .scaler import LossScaler
-from ._amp_state import _amp_state, master_params
+from ._amp_state import _amp_state, master_params, maybe_print
 from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
 from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
@@ -106,9 +105,8 @@ def scale_loss(loss,
     if should_skip:
         optimizer_step = optimizer.step
         def skip_step():
-            logger = logging.getLogger('apex.amp')
-            logger.warning("Gradient overflow. Skipping step, reducing " +
-                           "loss scale to {}".format(optimizer.loss_scaler.loss_scale()))
+            maybe_print("Gradient overflow. Skipping step, reducing " +
+                        "loss scale to {}".format(optimizer.loss_scaler.loss_scale()))
             optimizer.step = optimizer_step
         optimizer.step = skip_step

     # Probably ok to skip this if not delay_unscale
@@ -171,8 +169,7 @@ class AmpHandle(object):
         if should_skip:
             optimizer_step = optimizer.step
             def skip_step():
-                logger = logging.getLogger('apex.amp')
-                logger.warning('Gradient overflow, skipping update')
+                maybe_print('Gradient overflow, skipping update')
                 optimizer.step = optimizer_step
             optimizer.step = skip_step
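The skip_step pattern above temporarily replaces optimizer.step with a no-op that restores the original on its first call, so exactly one step is skipped after an overflow. A standalone sketch of the same idea with placeholder names (not apex code):

class ToyOptimizer:
    def step(self):
        print("real step")

def install_skip(optimizer, message):
    real_step = optimizer.step
    def skip_step():
        print(message)              # the diff routes this message through maybe_print
        optimizer.step = real_step  # restore, so only this one step is skipped
    optimizer.step = skip_step

opt = ToyOptimizer()
install_skip(opt, "Gradient overflow, skipping update")
opt.step()  # prints the overflow message and restores the real step
opt.step()  # prints "real step"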
...
 import contextlib
-import logging
 import warnings
 from .scaler import LossScaler, master_params
+from ._amp_state import maybe_print
 import numpy as np
@@ -71,8 +71,7 @@ class OptimWrapper(object):
                 'The `closure` argument is unsupported by the amp ' +
                 'optimizer wrapper.')
         if any(self._skip_next):
-            logger = logging.getLogger('apex.amp')
-            logger.info('Gradient overflow, skipping update')
+            maybe_print('Gradient overflow, skipping update')
             self._skip_next = [False] * self._num_loss
         else:
             return self._optimizer.step(closure=closure)
...
 import torch
-import logging
 from ..multi_tensor_apply import multi_tensor_applier
-from ._amp_state import _amp_state, master_params
+from ._amp_state import _amp_state, master_params, maybe_print
 from itertools import product

 # from apex_C import scale_check_overflow
@@ -46,10 +45,12 @@ class LossScaler(object):
             LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
         else:
             if not LossScaler.warned_no_fused_kernel:
-                print("Warning: multi_tensor_applier fused unscale kernel is unavailable, "
-                      "possibly because apex was installed without --cuda_ext --cpp_ext. "
-                      "Using Python fallback. Original ImportError was: ",
-                      multi_tensor_applier.import_err)
+                maybe_print(
+                    "Warning: multi_tensor_applier fused unscale kernel is unavailable, "
+                    "possibly because apex was installed without --cuda_ext --cpp_ext. "
+                    "Using Python fallback. Original ImportError was: " +
+                    multi_tensor_applier.import_err,
+                    True)
             LossScaler.has_fused_kernel = False
             LossScaler.warned_no_fused_kernel = True
@@ -61,8 +62,7 @@ class LossScaler(object):
         if model is not None:
             if not LossScaler.warned_unscaling_non_fp32_grad:
                 if master.type() != "torch.cuda.FloatTensor":
-                    logger = logging.getLogger("apex.amp")
-                    logger.warning(
+                    maybe_print(
                         "Attempting to unscale a grad with type {} ".format(master.type()) +
                         "Unscaling non-fp32 grads may indicate an error. "
                         "When using Amp, you don't need to call .half() on your model.")
...
@@ -4,6 +4,7 @@ from torch.autograd import Variable
 from torch.nn.parameter import Parameter
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

+from ..amp._amp_state import _amp_state, maybe_print
 from ..amp.scaler import LossScaler
 from ..multi_tensor_apply import multi_tensor_applier
 from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm
@@ -193,6 +194,8 @@ class FP16_Optimizer(object):
             self.multi_tensor_scale = amp_C.multi_tensor_scale
             self._dummy_overflow_buf = torch.cuda.IntTensor([0]);

+    # Having self.maybe_print distinct from _amp_state.maybe_print is another artifact
+    # of having to support FP16_Optimizer separately, for the time being.
     def maybe_print(self, msg):
         if self.verbose:
             print(msg)
@@ -401,8 +404,9 @@
         # self._update_scale(self.overflow)
         if self.overflow:
-            print("Gradient overflow. Skipping step, reducing " +
-                  "loss scale to {}".format(self.loss_scaler.loss_scale()))
+            # Using _amp_state.maybe_print instead of self.maybe_print here is intentional.
+            maybe_print("Gradient overflow. Skipping step, reducing " +
+                        "loss scale to {}".format(self.loss_scaler.loss_scale()))
             return

         if closure is not None:
...
@@ -6,8 +6,14 @@ print_banner() {
 print_banner "Distributed status: $1"

-# DATADIR="/home/mcarilli/Desktop/pt18data/apex/examples/imagenet/bare_metal_train_val/"
-DATADIR="/opt/home/apex/examples/imagenet/"
+echo $2
+if [ -n "$2" ]
+then
+    DATADIR="$2"
+else
+    # DATADIR="/home/mcarilli/Desktop/pt18data/apex/examples/imagenet/bare_metal_train_val/"
+    DATADIR="/opt/home/apex/examples/imagenet/"
+fi

 if [ "$1" == "single_gpu" ]
 then
...
 #!/bin/bash

 cp ../common/* .

-bash run_test.sh distributed
+bash run_test.sh distributed $1