".github/git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "25e96f4246115a3deeec0afbd4ac52c47c0fa934"
Commit 59e992da authored by Michael Carilli

Stashing to test on the cluster

parent 40555b3a
......@@ -2,13 +2,19 @@
# I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like.
# But apparently it's ok:
# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
import os
import torch
class AmpState(object):
def __init__(self):
self.hard_override=False
self.verbosity=1
# Attribute stash. Could also just stash things as global module attributes.
_amp_state = AmpState()
def warn_or_err(msg):
if _amp_state.hard_override:
print("Warning: " + msg)
......@@ -18,11 +24,30 @@ def warn_or_err(msg):
# + " If you're sure you know what you're doing, supply " +
# "hard_override=True to amp.initialize.")
distributed = False
if 'WORLD_SIZE' in os.environ:
distributed = int(os.environ['WORLD_SIZE']) > 1
def maybe_print(msg, rank0=False):
if _amp_state.verbosity > 0:
if rank0:
if distributed:
if torch.distributed.get_rank() == 0:
print(msg)
else:
print(msg)
else:
print(msg)
# def iter_params(param_groups):
# for group in param_groups:
# for p in group['params']:
# yield p
def master_params(optimizer):
"""
Generator expression that iterates over the params owned by ``optimizer``.
......
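The _amp_state module above acts as a process-wide attribute stash: other parts of amp import the single AmpState instance, read or set attributes on it, and route output through maybe_print so the verbosity setting (and rank-0 filtering under torch.distributed) is respected. Below is a minimal sketch of a hypothetical consumer module, assuming only the import path apex.amp._amp_state and the opt_properties/options attributes that appear elsewhere in this diff:

# Hypothetical consumer module; import path assumed from the diff above.
from apex.amp._amp_state import _amp_state, maybe_print

def report_config():
    # Read shared state stashed by amp.initialize(), if it has run.
    props = getattr(_amp_state, "opt_properties", None)
    if props is None:
        maybe_print("Amp has not been initialized yet.")
        return
    # rank0=True prints only once per job when torch.distributed is in use.
    maybe_print("Current Amp options: {}".format(props.options), rank0=True)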
import torch
from ._initialize import _initialize
from ._amp_state import _amp_state, warn_or_err
from ._amp_state import _amp_state, warn_or_err, maybe_print
class Properties(object):
......@@ -199,7 +199,8 @@ def initialize(
patch_torch_functions=None,
keep_batchnorm_fp32=None,
master_weights=None,
loss_scale=None
loss_scale=None,
verbosity=1,
):
"""
Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
......@@ -219,7 +220,7 @@ def initialize(
optimizers (torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script
should run as if Amp were not present.
opt_level(str, required): Pure or mixed precision optimization level. Accepted values are
opt_level (str, required): Pure or mixed precision optimization level. Accepted values are
"O0", "O1", "O2", and "O3", explained in detail above.
cast_model_type (``torch.dtype``, optional, default=None): Optional property override, see
above.
......@@ -227,8 +228,9 @@ def initialize(
keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
passed as a string, must be the string "True" or "False".
master_weights (bool, optional, default=None): Optional property override.
loss_scale(float or str, default=None): Optional property override. If passed as a string,
loss_scale (float or str, default=None): Optional property override. If passed as a string,
must be a string representing a number, e.g., "128.0", or the string "dynamic".
verbosity (int, default=1): Set to 0 to suppress Amp-related output.
Returns:
Model(s) and optimizer(s) modified according to the ``opt_level``.
......@@ -266,8 +268,10 @@ def initialize(
.. _`Imagenet example`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet
"""
_amp_state.opt_properties = Properties()
_amp_state.opt_properties.verbosity = verbosity
if not enabled:
_amp_state.opt_properties = Properties()
return models, optimizers
if opt_level not in opt_levels:
......@@ -275,16 +279,16 @@ def initialize(
"Unexpected optimization level {}. ".format(opt_level) +
"Options are 'O0', 'O1', 'O2', 'O3'.")
else:
_amp_state.opt_properties = opt_levels[opt_level](Properties())
print("Selected optimization level {}".format(opt_levels[opt_level].brief))
print("Defaults for this optimization level are:")
print(_amp_state.opt_properties.options)
_amp_state.opt_properties = opt_levels[opt_level](_amp_state.opt_properties)
maybe_print("Selected optimization level {}".format(opt_levels[opt_level].brief), True)
maybe_print("Defaults for this optimization level are:", True)
maybe_print(_amp_state.opt_properties.options, True)
for k, v in _amp_state.opt_properties.options.items():
print("{:22} : {}".format(k, v))
maybe_print("{:22} : {}".format(k, v), True)
print("Processing user overrides (additional kwargs that are not None)...")
# I chose to have the keyword arguments listed directly in the argument list, so I
# can't use kwargs.items() here.
maybe_print("Processing user overrides (additional kwargs that are not None)...", True)
# I chose to have the keyword arguments listed directly in the argument list,
# instead of **kwargs, so I can't use kwargs.items() here.
if enabled is not None:
_amp_state.opt_properties.enabled = enabled
if opt_level is not None:
......@@ -300,9 +304,9 @@ def initialize(
if loss_scale is not None:
_amp_state.opt_properties.loss_scale = loss_scale
print("After processing overrides, optimization options are:")
maybe_print("After processing overrides, optimization options are:", True)
for k, v in _amp_state.opt_properties.options.items():
print("{:22} : {}".format(k, v))
maybe_print("{:22} : {}".format(k, v), True)
return _initialize(models, optimizers, _amp_state.opt_properties)
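For reference, a minimal usage sketch of the new verbosity argument. The model, optimizer, and data here are placeholders, and the scale_loss context manager is the one exported by apex.amp; this is a sketch, not a complete training script:

# Minimal sketch; model/optimizer/data are stand-ins.
import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# verbosity=0 silences the option banner and the messages routed through maybe_print.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

loss = model(torch.randn(4, 10).cuda()).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()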
......
import contextlib
import logging
import warnings
import torch
from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params
from ._amp_state import _amp_state, master_params, maybe_print
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
......@@ -106,9 +105,8 @@ def scale_loss(loss,
if should_skip:
optimizer_step = optimizer.step
def skip_step():
logger = logging.getLogger('apex.amp')
logger.warning("Gradient overflow. Skipping step, reducing " +
"loss scale to {}".format(optimizer.loss_scaler.loss_scale()))
maybe_print("Gradient overflow. Skipping step, reducing " +
"loss scale to {}".format(optimizer.loss_scaler.loss_scale()))
optimizer.step = optimizer_step
optimizer.step = skip_step
# Probably ok to skip this if not delay_unscale
......@@ -171,8 +169,7 @@ class AmpHandle(object):
if should_skip:
optimizer_step = optimizer.step
def skip_step():
logger = logging.getLogger('apex.amp')
logger.warning('Gradient overflow, skipping update')
maybe_print('Gradient overflow, skipping update')
optimizer.step = optimizer_step
optimizer.step = skip_step
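Both skip_step hunks above use the same trick: when an overflow is detected, optimizer.step is temporarily swapped for a no-op that reports the skip and then restores the original bound method. A standalone sketch of that pattern follows; the helper name and on_skip callback are illustrative, not part of apex:

# Illustrative sketch of the temporary step-patching pattern used above.
def install_skip_step(optimizer, on_skip):
    original_step = optimizer.step

    def skip_step(*args, **kwargs):
        on_skip()                       # e.g. print the overflow message
        optimizer.step = original_step  # restore the real step for the next iteration
    optimizer.step = skip_step

The caller keeps calling optimizer.step() as usual; the first call after an overflow does nothing except undo the patch.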
......
import contextlib
import logging
import warnings
from .scaler import LossScaler, master_params
from ._amp_state import maybe_print
import numpy as np
......@@ -71,8 +71,7 @@ class OptimWrapper(object):
'The `closure` argument is unsupported by the amp ' +
'optimizer wrapper.')
if any(self._skip_next):
logger = logging.getLogger('apex.amp')
logger.info('Gradient overflow, skipping update')
maybe_print('Gradient overflow, skipping update')
self._skip_next = [False] * self._num_loss
else:
return self._optimizer.step(closure=closure)
......
import torch
import logging
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state, master_params
from ._amp_state import _amp_state, master_params, maybe_print
from itertools import product
# from apex_C import scale_check_overflow
......@@ -46,10 +45,12 @@ class LossScaler(object):
LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
else:
if not LossScaler.warned_no_fused_kernel:
print("Warning: multi_tensor_applier fused unscale kernel is unavailable, "
"possibly because apex was installed without --cuda_ext --cpp_ext. "
"Using Python fallback. Original ImportError was: ",
multi_tensor_applier.import_err)
maybe_print(
"Warning: multi_tensor_applier fused unscale kernel is unavailable, "
"possibly because apex was installed without --cuda_ext --cpp_ext. "
"Using Python fallback. Original ImportError was: " +
multi_tensor_applier.import_err,
True)
LossScaler.has_fused_kernel = False
LossScaler.warned_no_fused_kernel = True
......@@ -61,8 +62,7 @@ class LossScaler(object):
if model is not None:
if not LossScaler.warned_unscaling_non_fp32_grad:
if master.type() != "torch.cuda.FloatTensor":
logger = logging.getLogger("apex.amp")
logger.warning(
maybe_print(
"Attempting to unscale a grad with type {} ".format(master.type()) +
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
......
......@@ -4,6 +4,7 @@ from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from ..amp._amp_state import _amp_state, maybe_print
from ..amp.scaler import LossScaler
from ..multi_tensor_apply import multi_tensor_applier
from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm
......@@ -193,6 +194,8 @@ class FP16_Optimizer(object):
self.multi_tensor_scale = amp_C.multi_tensor_scale
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
# Having self.maybe_print distinct from _amp_state.maybe_print is another artifact
# of having to support FP16_Optimizer separately, for the time being.
def maybe_print(self, msg):
if self.verbose:
print(msg)
......@@ -401,8 +404,9 @@ class FP16_Optimizer(object):
# self._update_scale(self.overflow)
if self.overflow:
print("Gradient overflow. Skipping step, reducing " +
"loss scale to {}".format(self.loss_scaler.loss_scale()))
# Using _amp_state.maybe_print instead of self.maybe_print here is intentional.
maybe_print("Gradient overflow. Skipping step, reducing " +
"loss scale to {}".format(self.loss_scaler.loss_scale()))
return
if closure is not None:
......
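The overflow branch above is one half of the usual dynamic loss scaling loop: on overflow the step is skipped and the scale is reduced, and after a run of clean steps the scale is grown back. An illustrative sketch of that control flow; the class, constants, and window length are assumptions, not apex's LossScaler:

# Illustrative dynamic loss scaling loop; constants are assumptions, not apex defaults.
class SimpleDynamicScaler(object):
    def __init__(self, init_scale=2.**16, factor=2.0, window=2000):
        self.scale = init_scale
        self.factor = factor
        self.window = window
        self.good_steps = 0

    def update(self, found_overflow):
        if found_overflow:
            self.scale /= self.factor      # back off and skip this optimizer step
            self.good_steps = 0
        else:
            self.good_steps += 1
            if self.good_steps % self.window == 0:
                self.scale *= self.factor  # cautiously grow after a clean window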
......@@ -6,8 +6,14 @@ print_banner() {
print_banner "Distributed status: $1"
# DATADIR="/home/mcarilli/Desktop/pt18data/apex/examples/imagenet/bare_metal_train_val/"
DATADIR="/opt/home/apex/examples/imagenet/"
echo $2
if [ -n "$2" ]
then
DATADIR="$2"
else
# DATADIR="/home/mcarilli/Desktop/pt18data/apex/examples/imagenet/bare_metal_train_val/"
DATADIR="/opt/home/apex/examples/imagenet/"
fi
if [ "$1" == "single_gpu" ]
then
......
#!/bin/bash
cp ../common/* .
bash run_test.sh distributed
bash run_test.sh distributed $1