Commit c142714b authored by Michael Carilli

Merging in latest master changes

parents b620f96b e6eec3ba
......@@ -37,7 +37,7 @@ optimized for NVIDIA's NCCL communication library.
[Python Source](https://github.com/NVIDIA/apex/tree/master/apex/parallel)
[Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed)
[Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed)
The [Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
shows use of `apex.parallel.DistributedDataParallel` along with `apex.amp`.
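For orientation, here is a minimal sketch of how the two are combined, patterned on the simple distributed examples further down this page (synthetic data, a single `Linear` layer, launched with `torch.distributed.launch`); the full Imagenet example adds real data loading and command-line options:

```python
import argparse
import torch
from apex import amp
from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')

N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Initialize Amp first, then wrap the model for distributed training.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
model = DDP(model)

loss_fn = torch.nn.MSELoss()
for t in range(500):
    optimizer.zero_grad()
    loss = loss_fn(model(x), y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
```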
......@@ -69,6 +69,8 @@ It's often convenient to use Apex in Docker containers. Compatible options incl
* [NVIDIA Pytorch containers from NGC](https://ngc.nvidia.com/catalog/containers/nvidia%2Fpytorch), which come with Apex preinstalled. To use the latest Amp API, you may need to `pip uninstall apex` then reinstall Apex using the **Quick Start** commands below.
* [official Pytorch -devel Dockerfiles](https://hub.docker.com/r/pytorch/pytorch/tags), e.g. `docker pull pytorch/pytorch:nightly-devel-cuda10.0-cudnn7`, in which you can install Apex using the **Quick Start** commands.
See the [Docker example folder](https://github.com/NVIDIA/apex/tree/master/examples/docker) for details.
# Quick Start
### Linux
......
......@@ -12,6 +12,7 @@ from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
from ..optimizers import FusedAdam
from ..parallel import DistributedDataParallel as apex_DDP
from ..parallel.LARC import LARC
def to_type(dtype, t):
......@@ -126,7 +127,7 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
from .amp import init as amp_init
optimizers_was_list = False
if isinstance(optimizers, torch.optim.Optimizer):
if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
    optimizers = [optimizers]
elif optimizers is None:
    optimizers = []
......
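The new `LARC` import and the widened `isinstance` check above let `amp.initialize` accept an optimizer wrapped in `apex.parallel.LARC`. A minimal sketch of the usage this enables (model size and learning rate are illustrative only):

```python
import torch
from apex import amp
from apex.parallel.LARC import LARC

model = torch.nn.Linear(1024, 16).cuda()
base_optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

# LARC wraps the base optimizer; the wrapped object is what gets handed to Amp,
# which is why the isinstance check above must recognize it.
optimizer = LARC(base_optimizer)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
```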
......@@ -463,4 +463,77 @@ def _process_optimizer(optimizer, properties):
optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer)
old_add_param_group = optimizer.add_param_group
def new_add_param_group(self, new_group):
    stash = self._amp_stash

    assert isinstance(new_group, dict), "param group must be a dict"

    new_params = new_group['params']
    if isinstance(new_params, torch.Tensor):
        new_group['params'] = [new_params]
    elif isinstance(new_params, set):
        raise TypeError('optimizer parameters need to be organized in ordered collections, but '
                        'the ordering of tensors in sets will change between runs. Please use a list instead.')
    else:
        new_group['params'] = list(new_params)

    if properties.master_weights:
        # Mutate new_group in-place to use FP32 master params
        fp16_params_this_group = []
        fp32_params_this_group = []
        fp32_from_fp16_params_this_group = []
        for i, param in enumerate(new_group['params']):
            if param.requires_grad:
                if param.type() == 'torch.cuda.HalfTensor':
                    fp16_params_this_group.append(param)
                    master_param = param.detach().clone().float()
                    master_param.requires_grad = True
                    new_group['params'][i] = master_param
                    fp32_from_fp16_params_this_group.append(master_param)
                elif param.type() == 'torch.cuda.FloatTensor':
                    fp32_params_this_group.append(param)
                    new_group['params'][i] = param
                else:
                    raise TypeError("Optimizer's parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))

        stash.fp16_groups.append(fp16_params_this_group)
        stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
        stash.fp32_from_fp32_groups.append(fp32_params_this_group)

        stash.all_fp16_params += fp16_params_this_group
        stash.all_fp32_from_fp16_params += fp32_from_fp16_params_this_group
        stash.all_fp32_from_fp32_params += fp32_params_this_group

        # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
        stash.all_fp32_from_fp32_grad_stash += [None for _ in fp32_params_this_group]

        # It should be ok to let params be added with existing .grad attributes.
        # for param in fp16_params_this_group:
        #     param.grad = None
        # for param in fp32_from_fp16_params_this_group:
        #     param.grad = None
        # for param in stash.fp32_params_this_group:
        #     param.grad = None
    else:
        for param in new_group['params']:
            if param.type() == 'torch.cuda.HalfTensor':
                stash.all_fp16_params.append(param)
                stash.all_fp16_grad_stash.append(None)
            elif param.type() == 'torch.cuda.FloatTensor':
                stash.all_fp32_params.append(param)
                stash.all_fp32_grad_stash.append(None)
            else:
                raise TypeError("Optimizer's parameters must be either "
                                "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                "Received {}".format(param.type()))

    old_add_param_group(new_group)

optimizer.add_param_group = types.MethodType(new_add_param_group, optimizer)

return optimizer
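The patched `add_param_group` above mirrors what Amp does for the original param groups at initialization: with master weights enabled, half-precision parameters in the new group are swapped for FP32 master copies and recorded in `_amp_stash`. From the caller's side nothing changes; a hedged sketch (the extra layer and learning rates are illustrative only):

```python
import torch
from apex import amp

model = torch.nn.Linear(1024, 16).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# O2 maintains FP32 master weights, so the patched add_param_group is exercised.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

# Parameters created after initialization can still be added as a new group;
# Amp builds the corresponding master parameters behind the scenes.
extra_layer = torch.nn.Linear(16, 16).cuda().half()
optimizer.add_param_group({'params': extra_layer.parameters(), 'lr': 1e-4})
```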
......@@ -70,7 +70,7 @@ class Properties(object):
if self.opt_level == "O1" and value is not None:
    warn_or_err("With opt_level O1, batchnorm functions are automatically patched "
                "to run in FP32, so keep_batchnorm_fp32 should be None." +
                "keep_batchnorm_fp32 was {}".format(keep_batchnorm_fp32))
                " keep_batchnorm_fp32 was {}".format(value))
if value == "False":
    self.options[name] = False
elif value == "True":
......@@ -78,7 +78,7 @@ class Properties(object):
    else:
        assert (value is True or value is False or value is None),\
            "keep_batchnorm_fp32 must be a boolean, the string 'True' or 'False', "\
            "or None, found keep_batchnorm_fp32={}".format(keep_batchnorm_fp32)
            "or None, found keep_batchnorm_fp32={}".format(value)
        self.options[name] = value
elif name == "master_weights":
    if self.opt_level == "O1" and value is not None:
......@@ -303,6 +303,10 @@ def initialize(
if not enabled:
    return models, optimizers

if not torch.backends.cudnn.enabled:
    raise RuntimeError(
        "Amp requires torch.backends.cudnn.enabled = True")

if opt_level not in opt_levels:
    raise RuntimeError(
        "Unexpected optimization level {}. ".format(opt_level) +
......
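The new guard makes the cuDNN requirement explicit, so `amp.initialize` fails fast instead of producing confusing errors later. For reference, a hedged sketch of the call this protects (the opt_level and keep_batchnorm_fp32 values are illustrative; under O1, keep_batchnorm_fp32 must stay None, which is what the corrected warning above reports):

```python
import torch
from apex import amp

# Amp now refuses to initialize if cuDNN has been disabled.
assert torch.backends.cudnn.enabled

model = torch.nn.Linear(1024, 16).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer,
                                  opt_level="O2",
                                  keep_batchnorm_fp32=True)
```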
......@@ -6,6 +6,7 @@ from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params, maybe_print
from ..parallel.LARC import LARC
# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
......@@ -73,11 +74,16 @@ def scale_loss(loss,
.. _`Advanced Amp Usage`:
https://nvidia.github.io/apex/advanced.html
"""
if not hasattr(_amp_state, "opt_properties"):
    raise RuntimeError("Invoked 'with amp.scale_loss`, but internal Amp state has not been initialized. "
                       "model, optimizer = amp.initialize(model, optimizer, opt_level=...) must be called "
                       "before `with amp.scale_loss`.")

if not _amp_state.opt_properties.enabled:
    yield loss
    return

if isinstance(optimizers, torch.optim.Optimizer):
if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
    optimizers = [optimizers]

loss_scaler = _amp_state.loss_scalers[loss_id]
......
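For reference, the pattern these checks guard (a minimal sketch; `amp.initialize` must have been called first, and a `LARC`-wrapped optimizer is now accepted here as well):

```python
import torch
from apex import amp

model = torch.nn.Linear(1024, 16).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

x = torch.randn(64, 1024, device='cuda')
y = torch.randn(64, 16, device='cuda')
loss = torch.nn.functional.mse_loss(model(x), y)

# Calling scale_loss before amp.initialize now raises a clear RuntimeError
# instead of failing on a missing internal attribute.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```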
......@@ -78,6 +78,7 @@ CASTS = [
'addcmul',
'atan2',
'cross',
'bilinear',
# Element-wise _or_ tensor-wise math
'add',
......
......@@ -65,6 +65,8 @@ def convert_network(network, dtype):
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
        continue
    convert_module(module, dtype)
    if isinstance(module, torch.nn.RNNBase) or isinstance(module, torch.nn.modules.rnn.RNNBase):
        module.flatten_parameters()
return network
......
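The added `flatten_parameters()` call re-compacts RNN weights after the dtype conversion, avoiding the familiar cuDNN warning about non-contiguous RNN weights. A hedged sketch of the conversion path (assuming `convert_network` is importable from `apex.fp16_utils`, as the hunk above suggests; the LSTM sizes are illustrative):

```python
import torch
from apex.fp16_utils import convert_network

rnn = torch.nn.LSTM(input_size=32, hidden_size=64, num_layers=2, batch_first=True).cuda()

# convert_network casts parameters and buffers to FP16 (skipping affine batchnorm layers);
# with this change, RNN modules also get flatten_parameters() called after conversion.
rnn = convert_network(rnn, torch.half)

x = torch.randn(8, 16, 32, device='cuda', dtype=torch.half)
out, _ = rnn(x)
```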
......@@ -83,6 +83,7 @@ class FP16_Optimizer(object):
        self.dynamic_loss_scale = False
        self.cur_iter = 0
        self.cur_scale = static_loss_scale
    self.verbose = verbose

def zero_grad(self, set_grads_to_None=True):
    """
......@@ -173,8 +174,9 @@ class FP16_Optimizer(object):
def _update_scale(self, skip):
    if self.dynamic_loss_scale:
        if skip:
            print("\nGrad overflow on iteration", self.cur_iter)
            print("Using dynamic loss scale of", self.cur_scale)
            if self.verbose:
                print("\nGrad overflow on iteration", self.cur_iter)
                print("Using dynamic loss scale of", self.cur_scale)
            self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
            self.last_overflow_iter = self.cur_iter
        else:
......
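With the stored `verbose` flag, the per-iteration overflow messages printed during dynamic loss scaling can now be silenced. A hedged sketch, assuming this is the `FP16_Optimizer` wrapper from `apex.optimizers` that pairs with `FusedAdam` (as the imports earlier in this commit suggest):

```python
import torch
from apex.optimizers import FusedAdam, FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = FusedAdam(model.parameters(), lr=1e-3)

# verbose=False suppresses the "Grad overflow on iteration ..." prints
# that would otherwise appear every time a step is skipped.
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True, verbose=False)
```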
......@@ -104,8 +104,6 @@ class Reducer(object):
When used with this launcher, :class:`Reducer` assumes 1:1 mapping of processes to GPUs.
It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
main_reducer.py in https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows example usage.
Args:
module_or_grads_list: Either a network definition (module) being run in multi-gpu/distributed mode, or an iterable of gradients to be reduced. If a module is passed in, the Reducer constructor will sync the parameters across processes (broadcasting from rank 0) to make sure they're all initialized with the same values. If a list of gradients (that came from some module) is passed in, the user is responsible for manually syncing that module's parameters at the beginning of training.
"""
......@@ -143,7 +141,7 @@ class DistributedDataParallel(Module):
When used with this launcher, :class:`DistributedDataParallel` assumes 1:1 mapping of processes to GPUs.
It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
https://github.com/NVIDIA/apex/tree/master/examples/distributed shows detailed usage.
https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed shows detailed usage.
https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows another example
that combines :class:`DistributedDataParallel` with mixed precision training.
......
......@@ -26,9 +26,6 @@ class SyncBatchnormFunction(Function):
count = int(input.numel()/input.size(1))
mean, var_biased = syncbn.welford_mean_var(input)

if count == 1:
    raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size()))

if torch.distributed.is_initialized():
    if not process_group:
        process_group = torch.distributed.group.WORLD
......@@ -45,6 +42,9 @@ class SyncBatchnormFunction(Function):
inv_std = 1.0 / torch.sqrt(var_biased + eps)
var = var_biased * (count) / (count-1)

if count == 1 and world_size < 2:
    raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size()))

r_m_inc = mean if running_mean.dtype != torch.float16 else mean.half()
r_v_inc = var if running_variance.dtype != torch.float16 else var.half()
running_mean.data = running_mean.data * (1-momentum) + momentum*r_m_inc
......
......@@ -66,10 +66,23 @@ class SyncBatchNorm(_BatchNorm):
torch.cuda.nvtx.range_push("sync_bn_fw_with_mean_var")
mean = None
var = None
cast = None
out = None

# casting to handle mismatch input type to layer type
if self.running_mean is not None:
    if self.running_mean.dtype != input.dtype:
        input = input.to(self.running_mean.dtype)
        cast = input.dtype
elif self.weight is not None:
    if self.weight.dtype != input.dtype:
        input = input.to(self.weight.dtype)
        cast = input.dtype

if not self.training and self.track_running_stats:
    # fall back to pytorch implementation for inference
    torch.cuda.nvtx.range_pop()
    return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
    out = F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
else:
    process_group = self.process_group
    world_size = 1
......@@ -114,4 +127,5 @@ class SyncBatchNorm(_BatchNorm):
                (m-1) * self.momentum * var + \
                (1 - self.momentum) * self.running_var
    torch.cuda.nvtx.range_pop()
    return SyncBatchnormFunction.apply(input, self.weight, self.bias, mean, var, self.eps, process_group, world_size)
    out = SyncBatchnormFunction.apply(input, self.weight, self.bias, mean, var, self.eps, process_group, world_size)
out = out.to(cast)
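Taken together, these changes let `apex.parallel.SyncBatchNorm` accept inputs whose dtype differs from the layer's parameters and running statistics (the input is cast to the layer's dtype), and the one-value-per-channel error now only fires when the batch is too small across the whole process group. A hedged usage sketch with the usual conversion helper (the toy model is a placeholder):

```python
import torch
from apex.parallel import convert_syncbn_model

model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3, padding=1),
    torch.nn.BatchNorm2d(8),
    torch.nn.ReLU(),
).cuda()

# Replace every torch.nn.BatchNorm*d with apex.parallel.SyncBatchNorm.
# In a multi-process run the layers reduce statistics across the process group;
# in a single process they behave like ordinary batchnorm.
model = convert_syncbn_model(model)
```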
# Simple examples of FP16_Optimizer functionality
To use `FP16_Optimizer` on a half-precision model, or a model with a mixture of
half and float parameters, only two lines of your training script need to change:
1. Construct an `FP16_Optimizer` instance from an existing optimizer.
2. Replace `loss.backward()` with `optimizer.backward(loss)`.
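As a before/after sketch of those two changes (the model and data mirror the minimal example reproduced further down this page):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Change 1: wrap the existing optimizer.
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)

x = torch.randn(64, 1024, device='cuda', dtype=torch.half)
y = torch.randn(64, 16, device='cuda', dtype=torch.half)

optimizer.zero_grad()
loss = torch.nn.MSELoss()(model(x).float(), y.float())
# Change 2: loss.backward() becomes optimizer.backward(loss).
optimizer.backward(loss)
optimizer.step()
```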
#### [Full API Documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)
See "Other Options" at the bottom of this page for some cases that require special treatment.
#### Minimal Working Sample
`minimal.py` shows the basic usage of `FP16_Optimizer` with either static or dynamic loss scaling. Test via `python minimal.py`.
#### Closures
`FP16_Optimizer` supports closures with the same control flow as ordinary Pytorch optimizers.
`closure.py` shows an example. Test via `python closure.py`.
See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.step) for more details.
#### Serialization/Deserialization
`FP16_Optimizer` supports saving and loading with the same control flow as ordinary Pytorch optimizers.
`save_load.py` shows an example. Test via `python save_load.py`.
See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.load_state_dict) for more details.
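`save_load.py` itself is not reproduced on this page; the control flow it demonstrates is the usual PyTorch checkpoint pattern (a hedged sketch with a hypothetical file name):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=1e-3),
                           dynamic_loss_scale=True)

# Saving: FP16_Optimizer.state_dict() captures the master params and loss scaler state.
torch.save({'model': model.state_dict(),
            'optimizer': optimizer.state_dict()}, 'checkpoint.pt')  # hypothetical path

# Loading: restore the model first, then the FP16_Optimizer state.
checkpoint = torch.load('checkpoint.pt')
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
```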
#### Distributed
**distributed_apex** shows an example using `FP16_Optimizer` with Apex DistributedDataParallel.
The usage of `FP16_Optimizer` with distributed does not need to change from ordinary single-process
usage. Test via
```bash
cd distributed_apex
bash run.sh
```
**distributed_pytorch** shows an example using `FP16_Optimizer` with Pytorch DistributedDataParallel.
Again, the usage of `FP16_Optimizer` with distributed does not need to change from ordinary
single-process usage. Test via
```bash
cd distributed_pytorch
bash run.sh
```
#### Other Options
Gradient clipping requires that calls to `torch.nn.utils.clip_grad_norm`
be replaced with [fp16_optimizer_instance.clip_master_grads()](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.clip_master_grads). The [word_language_model example](https://github.com/NVIDIA/apex/blob/master/examples/word_language_model/main_fp16_optimizer.py) uses this feature.
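A hedged sketch of the replacement (the max_norm value is illustrative):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=1e-3),
                           static_loss_scale=128.0)

x = torch.randn(64, 1024, device='cuda', dtype=torch.half)
y = torch.randn(64, 16, device='cuda', dtype=torch.half)

optimizer.zero_grad()
loss = torch.nn.MSELoss()(model(x).float(), y.float())
optimizer.backward(loss)

# Instead of torch.nn.utils.clip_grad_norm(model.parameters(), 5.0),
# clip the FP32 master gradients that FP16_Optimizer maintains.
optimizer.clip_master_grads(5.0)

optimizer.step()
```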
Multiple losses will work if you simply replace
```python
loss1.backward()
loss2.backward()
```
with
```python
optimizer.backward(loss1)
optimizer.backward(loss2)
```
but `FP16_Optimizer` can be told to handle this more efficiently using the
[update_master_grads()](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.update_master_grads) option.
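A hedged sketch of the more efficient variant (the two losses and the separate forward passes are artificial, chosen to keep the example self-contained):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=1e-3),
                           static_loss_scale=128.0)

x = torch.randn(64, 1024, device='cuda', dtype=torch.half)
y = torch.randn(64, 16, device='cuda', dtype=torch.half)

optimizer.zero_grad()

loss1 = torch.nn.functional.mse_loss(model(x).float(), y.float())
optimizer.backward(loss1, update_master_grads=False)

loss2 = torch.nn.functional.l1_loss(model(x).float(), y.float())
optimizer.backward(loss2, update_master_grads=False)

# Copy fp16 model grads to the fp32 master grads once, after both backward passes.
optimizer.update_master_grads()
optimizer.step()
```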
import torch
from apex.fp16_utils import FP16_Optimizer
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = torch.optim.LBFGS(model.parameters())
### Construct FP16_Optimizer
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
###
loss_fn = torch.nn.MSELoss()
for t in range(5):
    def closure():
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_fn(y_pred.float(), y.float())
        ### Change loss.backward() within the closure to: ###
        optimizer.backward(loss)
        ###
        return loss
    loss = optimizer.step(closure)
print("final loss = ", loss)
**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
`apex.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
single-process usage. Test via
```bash
bash run.sh
```
import torch
import argparse
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import FP16_Optimizer
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl',
                                     init_method='env://')
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
model = torch.nn.Linear(D_in, D_out).cuda().half()
model = DDP(model)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
### Construct FP16_Optimizer ###
optimizer = FP16_Optimizer(optimizer)
###
loss_fn = torch.nn.MSELoss()
for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred.float(), y.float())
    ### Change loss.backward() to: ###
    optimizer.backward(loss)
    ###
    optimizer.step()
print("final loss = ", loss)
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
`torch.nn.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
single-process usage. Test via
```bash
bash run.sh
```
import torch
import argparse
from apex.fp16_utils import FP16_Optimizer
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl',
                                     init_method='env://')
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
model = torch.nn.Linear(D_in, D_out).cuda().half()
model = torch.nn.parallel.DistributedDataParallel(model,
                                                   device_ids=[args.local_rank],
                                                   output_device=args.local_rank)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
### Construct FP16_Optimizer ###
optimizer = FP16_Optimizer(optimizer)
###
loss_fn = torch.nn.MSELoss()
for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred.float(), y.float())
    ### Change loss.backward() to: ###
    optimizer.backward(loss)
    ###
    optimizer.step()
print("final loss = ", loss)
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
import torch
from apex.fp16_utils import FP16_Optimizer
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
### Construct FP16_Optimizer
### FP16_Optimizer will ingest and remember the original optimizer's param_groups.
###
### Construct with static loss scaling...
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
### ...or dynamic loss scaling
# optimizer = FP16_Optimizer(optimizer,
# dynamic_loss_scale=True,
# dynamic_loss_args={'scale_factor' : 2})
### dynamic_loss_args is optional, for "power users," and unnecessary in most cases.
loss_fn = torch.nn.MSELoss()
for t in range(200):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred.float(), y.float())
    ### Change loss.backward() to:
    optimizer.backward(loss)
    ###
    optimizer.step()
print("final loss = ", loss)