Commit 5e54253f authored by Michael Carilli's avatar Michael Carilli
Browse files

Updating examples

parent 1d2094a1
...@@ -6,6 +6,7 @@ from .fp16util import ( ...@@ -6,6 +6,7 @@ from .fp16util import (
master_params_to_model_params, master_params_to_model_params,
tofp16, tofp16,
to_python_float, to_python_float,
clip_grad_norm,
) )
......
...@@ -5,7 +5,7 @@ from torch.nn.parameter import Parameter ...@@ -5,7 +5,7 @@ from torch.nn.parameter import Parameter
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from .loss_scaler import DynamicLossScaler, LossScaler from .loss_scaler import DynamicLossScaler, LossScaler
from .fp16util import model_grads_to_master_grads, master_params_to_model_params from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm
FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
...@@ -199,12 +199,7 @@ class FP16_Optimizer(object): ...@@ -199,12 +199,7 @@ class FP16_Optimizer(object):
self.overflow = False self.overflow = False
self.first_closure_call_this_step = True self.first_closure_call_this_step = True
TORCH_MAJOR = int(torch.__version__.split('.')[0]) self.clip_grad_norm = clip_grad_norm
TORCH_MINOR = int(torch.__version__.split('.')[1])
if TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
self.clip_grad_norm = torch.nn.utils.clip_grad_norm
else:
self.clip_grad_norm = torch.nn.utils.clip_grad_norm_
def __getstate__(self): def __getstate__(self):
raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") raise RuntimeError("FP16_Optimizer should be serialized using state_dict().")
......
...@@ -128,9 +128,17 @@ def master_params_to_model_params(model_params, master_params, flat_master=False ...@@ -128,9 +128,17 @@ def master_params_to_model_params(model_params, master_params, flat_master=False
for model, master in zip(model_params, master_params): for model, master in zip(model_params, master_params):
model.data.copy_(master.data) model.data.copy_(master.data)
# item() is a recent addition, so this helps with backward compatibility. # Backward compatibility fixes
def to_python_float(t):
    """Convert a one-element tensor-like object to a plain Python number.

    Prefers ``.item()`` when the object provides it (newer Pytorch);
    otherwise falls back to indexing the first element, which is how
    one-element tensors were unwrapped in older Pytorch versions.
    """
    return t.item() if hasattr(t, 'item') else t[0]
# Pytorch renamed torch.nn.utils.clip_grad_norm to clip_grad_norm_ after
# version 0.4.  Bind whichever name the installed version provides so callers
# can always use fp16util.clip_grad_norm without deprecation warnings.
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
if TORCH_MAJOR != 0 or TORCH_MINOR > 4:
    clip_grad_norm = torch.nn.utils.clip_grad_norm_
else:
    clip_grad_norm = torch.nn.utils.clip_grad_norm
# Base image must at least have pytorch and CUDA installed. # Base image must at least have pytorch and CUDA installed.
FROM <base image> ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:18.04-py3
FROM $BASE_IMAGE
ARG BASE_IMAGE
RUN echo "Installing Apex on top of ${BASE_IMAGE}"
WORKDIR /workspace WORKDIR /workspace
# uninstall Apex if present # uninstall Apex if present
RUN pip uninstall -y apex || : RUN pip uninstall -y apex || :
......
## Create new container with Apex
**Dockerfile** installs the latest Apex on top of an existing image. Run
```
docker build -t image_with_apex .
```
By default, **Dockerfile** uses NVIDIA's Pytorch container as the base image,
which requires an NVIDIA GPU Cloud (NGC) account. If you don't have an NGC account, you can sign up for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).
Alternatively, you can supply your own base image via the `BASE_IMAGE` build-arg.
Any `BASE_IMAGE` you supply must have Pytorch and Cuda installed, for example:
```
docker build --build-arg BASE_IMAGE=pytorch/pytorch:0.4-cuda9-cudnn7-devel -t image_with_apex .
```
If you want to rebuild your image, and force the latest Apex to be cloned and installed, make any small change to the `SHA` variable in **Dockerfile**.
**Warning:**
Currently, Pytorch's default non-devel image on Dockerhub
[pytorch/pytorch:0.4_cuda9_cudnn7](https://hub.docker.com/r/pytorch/pytorch/tags/) contains Pytorch installed with prebuilt binaries. It does not contain NVCC, which means it is not an eligible candidate for `<base image>`.
## Install Apex in running container
Instead of building a new container, it is also a viable option to clone Apex on bare metal, mount the Apex repo into your container at launch by running, for example,
```
docker run --runtime=nvidia -it --rm --ipc=host -v /bare/metal/apex:/apex/in/container <base image>
```
then go to /apex/in/container within the running container and `python setup.py install`.
When specifying
```
FROM <base image>
```
in **Dockerfile**, `<base image>` must have Pytorch and CUDA installed.
If you have an NGC account, you can use Nvidia's official Pytorch container
```
nvcr.io/nvidia/pytorch:18.04-py3
```
as `<base image>`.
If you don't have an NGC account, you can sign up for one for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).
An alternative is to first
[build a local Pytorch image](https://github.com/pytorch/pytorch#docker-image) using Pytorch's Dockerfile on Github. From the root of your cloned Pytorch repo,
run
```
docker build -t my_pytorch_image -f docker/pytorch/Dockerfile .
```
`my_pytorch_image` will contain CUDA, and can be used as `<base image>`.
**Warning:**
Currently, Pytorch's latest stable image on Dockerhub
[pytorch/pytorch:0.4_cuda9_cudnn7](https://hub.docker.com/r/pytorch/pytorch/tags/) contains Pytorch installed with prebuilt binaries. It does not contain NVCC, which means it is not an eligible candidate for `<base image>`.
...@@ -177,7 +177,10 @@ def train(): ...@@ -177,7 +177,10 @@ def train():
loss.backward() loss.backward()
loss = loss / args.static_loss_scale loss = loss / args.static_loss_scale
# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) # apex.fp16_utils.clip_grad_norm selects between "torch.nn.utils.clip_grad_norm"
# and "torch.nn.utils.clip_grad_norm_" based on Pytorch version.
# It's not FP16-specific, just a small fix to avoid deprecation warnings.
clip_grad_norm(model.parameters(), args.clip)
if args.fp16 and args.cuda: if args.fp16 and args.cuda:
model_grads_to_master_grads(model_params, master_params) model_grads_to_master_grads(model_params, master_params)
......
...@@ -195,7 +195,11 @@ def train(): ...@@ -195,7 +195,11 @@ def train():
optimizer.clip_master_grads(args.clip) optimizer.clip_master_grads(args.clip)
else: else:
loss.backward() loss.backward()
torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
# apex.fp16_utils.clip_grad_norm selects between "torch.nn.utils.clip_grad_norm"
# and "torch.nn.utils.clip_grad_norm_" based on Pytorch version.
# It's not FP16-specific, just a small fix to avoid deprecation warnings.
clip_grad_norm(model.parameters(), args.clip)
optimizer.step() optimizer.step()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment