Commit 5e54253f authored by Michael Carilli's avatar Michael Carilli
Browse files

Updating examples

parent 1d2094a1
...@@ -6,6 +6,7 @@ from .fp16util import ( ...@@ -6,6 +6,7 @@ from .fp16util import (
master_params_to_model_params, master_params_to_model_params,
tofp16, tofp16,
to_python_float, to_python_float,
clip_grad_norm,
) )
......
...@@ -5,7 +5,7 @@ from torch.nn.parameter import Parameter ...@@ -5,7 +5,7 @@ from torch.nn.parameter import Parameter
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from .loss_scaler import DynamicLossScaler, LossScaler from .loss_scaler import DynamicLossScaler, LossScaler
from .fp16util import model_grads_to_master_grads, master_params_to_model_params from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm
FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
...@@ -199,12 +199,7 @@ class FP16_Optimizer(object): ...@@ -199,12 +199,7 @@ class FP16_Optimizer(object):
self.overflow = False self.overflow = False
self.first_closure_call_this_step = True self.first_closure_call_this_step = True
TORCH_MAJOR = int(torch.__version__.split('.')[0]) self.clip_grad_norm = clip_grad_norm
TORCH_MINOR = int(torch.__version__.split('.')[1])
if TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
self.clip_grad_norm = torch.nn.utils.clip_grad_norm
else:
self.clip_grad_norm = torch.nn.utils.clip_grad_norm_
def __getstate__(self): def __getstate__(self):
raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") raise RuntimeError("FP16_Optimizer should be serialized using state_dict().")
......
...@@ -128,9 +128,17 @@ def master_params_to_model_params(model_params, master_params, flat_master=False ...@@ -128,9 +128,17 @@ def master_params_to_model_params(model_params, master_params, flat_master=False
for model, master in zip(model_params, master_params): for model, master in zip(model_params, master_params):
model.data.copy_(master.data) model.data.copy_(master.data)
# item() is a recent addition, so this helps with backward compatibility. # Backward compatibility fixes
def to_python_float(t):
    """Convert a one-element tensor-like object to a plain Python number.

    Prefers ``.item()`` when the object provides it (newer Pytorch);
    otherwise falls back to indexing the first element, which is how
    one-element tensors were unwrapped in older Pytorch versions.
    """
    return t.item() if hasattr(t, 'item') else t[0]
# Pytorch renamed torch.nn.utils.clip_grad_norm to clip_grad_norm_ after
# version 0.4.  Bind whichever name the installed version provides so callers
# can always use fp16util.clip_grad_norm without deprecation warnings.
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
if TORCH_MAJOR != 0 or TORCH_MINOR > 4:
    clip_grad_norm = torch.nn.utils.clip_grad_norm_
else:
    clip_grad_norm = torch.nn.utils.clip_grad_norm
# Base image must at least have pytorch and CUDA installed. # Base image must at least have pytorch and CUDA installed.
FROM <base image> ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:18.04-py3
FROM $BASE_IMAGE
ARG BASE_IMAGE
RUN echo "Installing Apex on top of ${BASE_IMAGE}"
WORKDIR /workspace WORKDIR /workspace
# uninstall Apex if present # uninstall Apex if present
RUN pip uninstall -y apex || : RUN pip uninstall -y apex || :
......
## Create new container with Apex
**Dockerfile** installs the latest Apex on top of an existing image. Run
```
docker build -t image_with_apex .
```
By default, **Dockerfile** uses NVIDIA's Pytorch container as the base image,
which requires an NVIDIA GPU Cloud (NGC) account. If you don't have an NGC account, you can sign up for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).
Alternatively, you can supply your own base image via the `BASE_IMAGE` build-arg.
Any `BASE_IMAGE` you supply must have Pytorch and Cuda installed, for example:
```
docker build --build-arg BASE_IMAGE=pytorch/pytorch:0.4-cuda9-cudnn7-devel -t image_with_apex .
```
If you want to rebuild your image, and force the latest Apex to be cloned and installed, make any small change to the `SHA` variable in **Dockerfile**.
**Warning:**
Currently, Pytorch's default non-devel image on Dockerhub
[pytorch/pytorch:0.4_cuda9_cudnn7](https://hub.docker.com/r/pytorch/pytorch/tags/) contains Pytorch installed with prebuilt binaries. It does not contain NVCC, which means it is not an eligible candidate for `<base image>`.
## Install Apex in running container
Instead of building a new container, it is also a viable option to clone Apex on bare metal, mount the Apex repo into your container at launch by running, for example,
```
docker run --runtime=nvidia -it --rm --ipc=host -v /bare/metal/apex:/apex/in/container <base image>
```
then go to /apex/in/container within the running container and `python setup.py install`.
When specifying
```
FROM <base image>
```
in **Dockerfile**, `<base image>` must have Pytorch and CUDA installed.
If you have an NGC account, you can use Nvidia's official Pytorch container
```
nvcr.io/nvidia/pytorch:18.04-py3
```
as `<base image>`.
If you don't have an NGC account, you can sign up for one for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).
An alternative is to first
[build a local Pytorch image](https://github.com/pytorch/pytorch#docker-image) using Pytorch's Dockerfile on Github. From the root of your cloned Pytorch repo,
run
```
docker build -t my_pytorch_image -f docker/pytorch/Dockerfile .
```
`my_pytorch_image` will contain CUDA, and can be used as `<base image>`.
**Warning:**
Currently, Pytorch's latest stable image on Dockerhub
[pytorch/pytorch:0.4_cuda9_cudnn7](https://hub.docker.com/r/pytorch/pytorch/tags/) contains Pytorch installed with prebuilt binaries. It does not contain NVCC, which means it is not an eligible candidate for `<base image>`.
...@@ -177,7 +177,10 @@ def train(): ...@@ -177,7 +177,10 @@ def train():
loss.backward() loss.backward()
loss = loss / args.static_loss_scale loss = loss / args.static_loss_scale
# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) # apex.fp16_utils.clip_grad_norm selects between "torch.nn.utils.clip_grad_norm"
# and "torch.nn.utils.clip_grad_norm_" based on Pytorch version.
# It's not FP16-specific, just a small fix to avoid deprecation warnings.
clip_grad_norm(model.parameters(), args.clip)
if args.fp16 and args.cuda: if args.fp16 and args.cuda:
model_grads_to_master_grads(model_params, master_params) model_grads_to_master_grads(model_params, master_params)
......
...@@ -195,7 +195,11 @@ def train(): ...@@ -195,7 +195,11 @@ def train():
optimizer.clip_master_grads(args.clip) optimizer.clip_master_grads(args.clip)
else: else:
loss.backward() loss.backward()
torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
# apex.fp16_utils.clip_grad_norm selects between "torch.nn.utils.clip_grad_norm"
# and "torch.nn.utils.clip_grad_norm_" based on Pytorch version.
# It's not FP16-specific, just a small fix to avoid deprecation warnings.
clip_grad_norm(model.parameters(), args.clip)
optimizer.step() optimizer.step()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment