Commit c142714b authored by Michael Carilli

Merging in latest master changes

parents b620f96b e6eec3ba
import torch.nn as nn


class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError("""An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (weight.new(self.nlayers, bsz, self.nhid).zero_(),
                    weight.new(self.nlayers, bsz, self.nhid).zero_())
        else:
            return weight.new(self.nlayers, bsz, self.nhid).zero_()
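
For readers skimming the commit, a minimal usage sketch of the model above (the vocabulary size, dimensions, and sequence/batch sizes are made-up illustration values, not anything from this commit):

```
import torch

# Hypothetical illustration values.
ntoken, ninp, nhid, nlayers = 1000, 200, 200, 2
seq_len, batch_size = 35, 20

model = RNNModel('LSTM', ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=True)
hidden = model.init_hidden(batch_size)                    # (h_0, c_0) for an LSTM
tokens = torch.randint(0, ntoken, (seq_len, batch_size))  # (seq_len, batch) token ids
output, hidden = model(tokens, hidden)                    # output: (seq_len, batch, ntoken)
```

Note that `tie_weights=True` requires `ninp == nhid`, since the decoder's weight matrix is shared with the `ntoken x ninp` embedding.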
 # Base image must at least have pytorch and CUDA installed.
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:19.01-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:19.03-py3
 FROM $BASE_IMAGE
 ARG BASE_IMAGE
 RUN echo "Installing Apex on top of ${BASE_IMAGE}"
-WORKDIR /workspace
-# uninstall Apex if present
+# make sure we don't overwrite some existing directory called "apex"
+WORKDIR /tmp/unique_for_apex
+# uninstall Apex if present, twice to make absolutely sure :)
+RUN pip uninstall -y apex || :
 RUN pip uninstall -y apex || :
 # SHA is something the user can touch to force recreation of this Docker layer,
 # and therefore force cloning of the latest version of Apex
 RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git
-WORKDIR /workspace/apex
-RUN python setup.py install --cuda_ext --cpp_ext
+WORKDIR /tmp/unique_for_apex/apex
+RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
 WORKDIR /workspace
@@ -2,22 +2,31 @@
 **Dockerfile** installs the latest Apex on top of an existing image. Run
 ```
-docker build -t image_with_apex .
+docker build -t new_image_with_apex .
 ```
 By default, **Dockerfile** uses NVIDIA's Pytorch container as the base image,
 which requires an NVIDIA GPU Cloud (NGC) account. If you don't have an NGC account, you can sign up for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).
 Alternatively, you can supply your own base image via the `BASE_IMAGE` build-arg.
-Any `BASE_IMAGE` you supply must have Pytorch and Cuda installed, for example:
+`BASE_IMAGE` must have Pytorch and Cuda installed. For example, any
+`-devel` image for Pytorch 1.0 and later from the
+[official Pytorch Dockerhub](https://hub.docker.com/r/pytorch/pytorch) may be used:
 ```
-docker build --build-arg BASE_IMAGE=pytorch/pytorch:0.4-cuda9-cudnn7-devel -t image_with_apex .
+docker build --build-arg BASE_IMAGE=pytorch/pytorch:nightly-devel-cuda10.0-cudnn7 -t new_image_with_apex .
 ```
 If you want to rebuild your image, and force the latest Apex to be cloned and installed, make any small change to the `SHA` variable in **Dockerfile**.
 **Warning:**
-Currently, Pytorch's default non-devel image on Dockerhub
-[pytorch/pytorch:0.4_cuda9_cudnn7](https://hub.docker.com/r/pytorch/pytorch/tags/) contains Pytorch installed with prebuilt binaries. It does not contain NVCC, which means it is not an eligible candidate for `<base image>`.
+Currently, the non-`-devel` images on Pytorch Dockerhub do not contain the Cuda compiler `nvcc`. Therefore,
+images whose names do not contain `-devel` are not eligible candidates for `BASE_IMAGE`.
+
+### Running your Apex container
+Like any Cuda-enabled Pytorch container, a container with Apex should be run via [nvidia-docker](https://github.com/NVIDIA/nvidia-docker), for example:
+```
+docker run --runtime=nvidia -it --rm --ipc=host new_image_with_apex
+```
 ## Option 2: Install Apex in a running container
@@ -25,4 +34,7 @@ Instead of building a new container, it is also a viable option to `git clone ht
 ```
 docker run --runtime=nvidia -it --rm --ipc=host -v /bare/metal/apex:/apex/in/container <base image>
 ```
-then go to /apex/in/container within the running container and `python setup.py install [--cuda_ext] [--cpp_ext]`.
+then go to /apex/in/container within the running container and
+```
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
+```
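
Whichever option is used, a quick hedged sanity check that the install succeeded is to import the package inside the container (this only verifies the Python packaging step; exercising the Cuda extensions still requires a GPU):

```
# Run inside the container after installing Apex.
import apex
from apex import amp

print("apex imported from", apex.__file__)
```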
@@ -82,7 +82,6 @@ def fast_collate(batch):
     tensor = torch.zeros( (len(imgs), 3, h, w), dtype=torch.uint8 )
     for i, img in enumerate(imgs):
         nump_array = np.asarray(img, dtype=np.uint8)
-        tens = torch.from_numpy(nump_array)
         if(nump_array.ndim < 3):
             nump_array = np.expand_dims(nump_array, axis=-1)
         nump_array = np.rollaxis(nump_array, 2)
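
The removed `tens = torch.from_numpy(nump_array)` was dead code: its result was never used, and the conversion still happens where the array is written into the preallocated batch tensor. For context, a sketch of the full collate function this hunk belongs to (reconstructed for illustration; only the lines shown in the hunk are guaranteed to match the source):

```
import numpy as np
import torch

def fast_collate(batch):
    # batch: list of (PIL.Image, target) pairs. Collate to uint8 without
    # normalizing, leaving float conversion to the GPU side of the pipeline.
    imgs = [sample[0] for sample in batch]
    targets = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)
    w, h = imgs[0].size
    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
    for i, img in enumerate(imgs):
        nump_array = np.asarray(img, dtype=np.uint8)
        if nump_array.ndim < 3:
            nump_array = np.expand_dims(nump_array, axis=-1)
        nump_array = np.rollaxis(nump_array, 2)    # HWC -> CHW
        tensor[i] += torch.from_numpy(nump_array)  # conversion happens here
    return tensor, targets
```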
@@ -190,8 +189,9 @@ def main():
     valdir = os.path.join(args.data, 'val')
     if(args.arch == "inception_v3"):
-        crop_size = 299
-        val_size = 320 # I chose this value arbitrarily, we can adjust.
+        raise RuntimeError("Currently, inception_v3 is not supported by this example.")
+        # crop_size = 299
+        # val_size = 320 # I chose this value arbitrarily, we can adjust.
     else:
         crop_size = 224
         val_size = 256
...
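
For context on the `crop_size` / `val_size` pair kept in the `else` branch: they typically feed a torchvision resize-then-center-crop validation pipeline along these lines (a sketch; the actual transform chain is outside this hunk). `ToTensor` is deliberately absent in this style of pipeline because `fast_collate` above consumes PIL images directly:

```
import torchvision.transforms as transforms
import torchvision.datasets as datasets

# Sketch: how crop_size / val_size are typically consumed downstream.
val_dataset = datasets.ImageFolder(
    valdir,
    transforms.Compose([
        transforms.Resize(val_size),       # shorter side scaled to val_size
        transforms.CenterCrop(crop_size),  # final spatial size fed to the network
    ]))
```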
@@ -47,10 +47,12 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
     print(raw_output + "from " + cuda_dir + "/bin\n")
     if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
-        # TODO: make this a hard error?
-        print("\nWarning: Cuda extensions are being compiled with a version of Cuda that does "
-              "not match the version used to compile Pytorch binaries.\n")
-        print("Pytorch binaries were compiled with Cuda {}\n".format(torch.version.cuda))
+        raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " +
+                           "not match the version used to compile Pytorch binaries. " +
+                           "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) +
+                           "In some cases, a minor-version mismatch will not cause later errors: " +
+                           "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " +
+                           "You can try commenting out this check (at your own risk).")
     if "--cuda_ext" in sys.argv:
         from torch.utils.cpp_extension import CUDAExtension
...
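
The hunk turns a version-mismatch warning into a hard error. For context, a sketch of where the compared versions plausibly come from (the parsing is illustrative, not quoted from setup.py; `get_version_pairs` is a hypothetical helper, and a Cuda-enabled Pytorch build is assumed so that `torch.version.cuda` is not None):

```
import subprocess
import torch

def get_version_pairs(cuda_dir):
    # Hypothetical helper. Bare-metal toolkit version parsed from `nvcc -V`,
    # whose output ends like "... release 10.0, V10.0.130".
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                         universal_newlines=True)
    release = raw_output.split("release ")[1].split(",")[0]      # e.g. "10.0"
    bare_metal_major, bare_metal_minor = release.split(".")[:2]

    # Version the installed Pytorch binaries were compiled against, e.g. "10.0".
    torch_binary_major, torch_binary_minor = torch.version.cuda.split(".")[:2]

    return (bare_metal_major, bare_metal_minor), (torch_binary_major, torch_binary_minor)
```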
import unittest

import functools as ft
import itertools as it

from apex import amp
from apex.amp import _amp_state
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Parameter

from utils import common_init, HALF, FLOAT,\
    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT


class MyModel(torch.nn.Module):
    def __init__(self, unique):
        super(MyModel, self).__init__()
        self.weight0 = Parameter(unique +
                                 torch.arange(2, device='cuda', dtype=torch.float32))
        self.weight1 = Parameter(1. + unique + torch.arange(2, device='cuda', dtype=torch.float16))

    @staticmethod
    def ops(input, weight0, weight1):
        return ((input*(weight0.float()))*(weight1.float())).sum()

    def forward(self, input):
        return self.ops(input, self.weight0, self.weight1)


# Abandon all hope, ye who enter here.
class TestAddParamGroup(unittest.TestCase):
    def setUp(self):
        self.x = torch.ones((2), device='cuda', dtype=torch.float32)
        common_init(self)

    def tearDown(self):
        pass

    def zero_grad(self, models, optimizer, how_to_zero):
        if how_to_zero == "none":
            for model in models:
                for param in model.parameters():
                    param.grad = None
        elif how_to_zero == "model":
            for model in models:
                model.zero_grad()
        elif how_to_zero == "optimizer":
            optimizer.zero_grad()

    def test_add_param_group(self):
        for opt_level in ("O0", "O1", "O2", "O3"):
            for zero_before_add in (True, False):
                for try_accumulation in (True, False):
                    model0 = MyModel(1)
                    model1 = MyModel(2)

                    optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                                momentum=0.125)

                    optimizer.zero_grad()
                    loss = model0(self.x)
                    loss.backward()
                    optimizer.step()

                    if zero_before_add:
                        optimizer.zero_grad()
                    optimizer.add_param_group({'params' : model1.parameters(), 'lr' : 0.5})
                    if not zero_before_add:
                        optimizer.zero_grad()

                    loss = model0(self.x) + model1(self.x)
                    loss.backward(retain_graph=try_accumulation)
                    if try_accumulation:
                        loss.backward()
                    optimizer.step()

                    # Once more to make sure the new params pick up momentums properly
                    optimizer.zero_grad()
                    loss = model0(self.x) + model1(self.x)
                    loss.backward(retain_graph=try_accumulation)
                    if try_accumulation:
                        loss.backward()
                    optimizer.step()

                    reference_params = [param.data.clone() for param in model0.parameters()] + \
                                       [param.data.clone() for param in model1.parameters()]

                    for how_to_zero in "none", "model", "optimizer":
                        model0 = MyModel(1)
                        model1 = MyModel(2)

                        optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                                    momentum=0.125)

                        _amp_state.allow_incoming_model_not_fp32 = True
                        [model0, model1], optimizer = amp.initialize([model0, model1],
                                                                     optimizer,
                                                                     opt_level=opt_level,
                                                                     verbosity=0,
                                                                     cast_model_type=False)
                        _amp_state.allow_incoming_model_not_fp32 = False

                        _amp_state.loss_scalers[0]._loss_scale = 4.0

                        self.zero_grad([model0, model1], optimizer, how_to_zero)
                        loss = model0(self.x)
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        optimizer.step()

                        if zero_before_add:
                            self.zero_grad([model0, model1], optimizer, how_to_zero)
                        optimizer.add_param_group({'params' : model1.parameters(), 'lr' : 0.5})
                        if not zero_before_add:
                            self.zero_grad([model0, model1], optimizer, how_to_zero)

                        loss = model0(self.x) + model1(self.x)
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward(retain_graph=try_accumulation)
                        if try_accumulation:
                            with amp.scale_loss(loss, optimizer) as scaled_loss:
                                scaled_loss.backward()
                        optimizer.step()

                        # Once more to make sure the new params pick up momentums properly
                        self.zero_grad([model0, model1], optimizer, how_to_zero)
                        loss = model0(self.x) + model1(self.x)
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward(retain_graph=try_accumulation)
                        if try_accumulation:
                            with amp.scale_loss(loss, optimizer) as scaled_loss:
                                scaled_loss.backward()
                        optimizer.step()

                        final_params = [param.data.clone() for param in model0.parameters()] + \
                                       [param.data.clone() for param in model1.parameters()]

                        for reference, final in zip(reference_params, final_params):
                            self.assertTrue(torch.allclose(reference.to(final.dtype), final),
                                            "opt_level = {}, how_to_zero = {}, zero_before_add = {}".format(
                                                opt_level, how_to_zero, zero_before_add))


if __name__ == '__main__':
    unittest.main()
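
One subtlety this test leans on: `torch.optim.SGD` creates momentum buffers lazily, on the first `step()` in which a parameter has a gradient. Parameters added through `add_param_group` therefore start without momentum state, which is what the "Once more" passes above are checking. A minimal plain-Pytorch sketch of that API, independent of amp:

```
import torch

w0 = torch.ones(2, requires_grad=True)
w1 = torch.ones(2, requires_grad=True)
opt = torch.optim.SGD([{'params': [w0], 'lr': 0.25}], momentum=0.125)

w0.sum().backward()
opt.step()                 # w0's momentum buffer is created on this step

opt.add_param_group({'params': [w1], 'lr': 0.5})
opt.zero_grad()
(w0.sum() + w1.sum()).backward()
opt.step()                 # w1's momentum buffer is created here, one step later
```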
@@ -86,7 +86,6 @@ def fast_collate(batch):
     tensor = torch.zeros( (len(imgs), 3, h, w), dtype=torch.uint8 )
     for i, img in enumerate(imgs):
         nump_array = np.asarray(img, dtype=np.uint8)
-        tens = torch.from_numpy(nump_array)
         if(nump_array.ndim < 3):
             nump_array = np.expand_dims(nump_array, axis=-1)
         nump_array = np.rollaxis(nump_array, 2)
...
@@ -16,6 +16,7 @@ images=(
     "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.03-py3-devel"
     "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
     "pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
+    "pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel"
     "pytorch/pytorch:1.0.1-cuda10.0-cudnn7-devel"
     "pytorch/pytorch:1.0-cuda10.0-cudnn7-devel"
     "pytorch/pytorch:nightly-devel-cuda9.2-cudnn7"
...