Check cuda version (#216)

* Adding Torch + bare-metal nvcc version check and container build tests * Putting a canary in the coalmine * canary proved elusive * Trying direct setup.py install * this should work * Removing canary * hopefully this works

Check cuda version (#216)
* Adding Torch + bare-metal nvcc version check and container build tests * Putting a canary in the coalmine * canary proved elusive * Trying direct setup.py install * this should work * Removing canary * hopefully this works
5b8faa29 · mcarilli · GitHub · 6e5d9099 · 5b8faa29 · 5b8faa29
Unverified Commit 5b8faa29 authored Mar 21, 2019 by mcarilli Committed by GitHub Mar 21, 2019
Show whitespace changes
Inline Side-by-side

Showing with 98 additions and 4 deletions

setup.py setup.py +26 -4

tests/docker_extension_builds/run.sh tests/docker_extension_builds/run.sh +72 -0

No files found.
--- a/setup.py
+++ b/setup.py
 import torch
 from setuptools import setup, find_packages
+import subprocess
 import sys
 if not torch.cuda.is_available():
-    print("Warning: Torch did not find available GPUs on this system.\n",
+    print("\nWarning: Torch did not find available GPUs on this system.\n",
-          "If your intention is to cross-compile, this is not an error.")
+          "If your intention is to cross-compile, this is not an error.\n")
 print("torch.__version__  = ", torch.__version__)
 TORCH_MAJOR = int(torch.__version__.split('.')[0])
@@ -32,6 +33,25 @@ if "--cpp_ext" in sys.argv:
        CppExtension('apex_C',
                     ['csrc/flatten_unflatten.cpp',]))
+def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
+    """Compare the Cuda version of the local nvcc with the one Pytorch was built with.
+
+    Runs ``<cuda_dir>/bin/nvcc -V``, echoes its output, and prints a warning
+    (not an error) when the major or minor version reported by nvcc differs
+    from ``torch.version.cuda``.
+
+    Args:
+        cuda_dir: Root of the Cuda installation; nvcc is invoked as
+            ``cuda_dir + "/bin/nvcc"``.
+
+    Raises:
+        subprocess.CalledProcessError: If nvcc exits with a nonzero status.
+        ValueError: If the nvcc output contains no ``release`` token.
+    """
+    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    # The token following "release" looks like "10.0," -- drop the trailing
+    # comma and keep the whole minor component, so that a multi-digit minor
+    # version is not truncated to its first character.
+    release = output[release_idx].rstrip(",").split(".")
+    bare_metal_major = release[0]
+    bare_metal_minor = release[1]
+    # NOTE(review): assumes torch.version.cuda is a "major.minor[...]" string;
+    # confirm how this should behave for Pytorch builds without Cuda support.
+    torch_binary_version = torch.version.cuda.split(".")
+    torch_binary_major = torch_binary_version[0]
+    torch_binary_minor = torch_binary_version[1]
+    print("\nCompiling cuda extensions with")
+    print(raw_output + "from " + cuda_dir + "/bin\n")
+    if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
+        # TODO:  make this a hard error?
+        print("\nWarning:  Cuda extensions are being compiled with a version of Cuda that does "
+              "not match the version used to compile Pytorch binaries.\n")
+    print("Pytorch binaries were compiled with Cuda {}\n".format(torch.version.cuda))
 if "--cuda_ext" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--cuda_ext")
@@ -39,6 +59,8 @@ if "--cuda_ext" in sys.argv:
    if torch.utils.cpp_extension.CUDA_HOME is None:
        raise RuntimeError("--cuda_ext was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
    else:
+        check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
        # Set up macros for forward/backward compatibility hack around
        # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
        version_ge_1_1 = []

--- a/tests/docker_extension_builds/run.sh
+++ b/tests/docker_extension_builds/run.sh
+#!/bin/bash
+# Print $1 in black on a green background, with blank lines above and below.
+print_banner() {
+  # Pass the text as a %s argument rather than as the format string, so any
+  # '%' or '\' in it is printed literally instead of being interpreted.
+  printf "\n\n\n\e[30m\e[42m%s\e[0m\n\n\n\n" "$1"
+}
+# Print $1 on one line in black on a green background.
+print_green() {
+  # Pass the text as a %s argument rather than as the format string, so any
+  # '%' or '\' in it is printed literally instead of being interpreted.
+  printf "\e[30m\e[42m%s\e[0m\n" "$1"
+}
+# Print $1 on one line in black on a red background.
+print_red() {
+  # Pass the text as a %s argument rather than as the format string, so any
+  # '%' or '\' in it is printed literally instead of being interpreted.
+  printf "\e[30m\e[41m%s\e[0m\n" "$1"
+}
+# Container images in which the extension build is attempted; each one is
+# pulled and run in turn by the loop further down.
+images=(
+"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.03-py3-devel"
+"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
+"pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
+"pytorch/pytorch:1.0.1-cuda10.0-cudnn7-devel"
+"pytorch/pytorch:1.0-cuda10.0-cudnn7-devel"
+"pytorch/pytorch:nightly-devel-cuda9.2-cudnn7"
+)
+# Branch of the apex repository that is checked out inside each container.
+branch="master"
+# Associative array mapping each image name to the exit code of its build.
+declare -A exit_codes
+# Start every image at the placeholder "None"; the build loop overwrites it
+# with the real exit code.  Iterate over the array elements -- a bare
+# `images` would loop once over the literal word "images".
+for image in "${images[@]}"
+do
+  exit_codes[$image]="None"
+done
+# Build apex with both extension flags inside each image and record the exit code.
+for image in "${images[@]}"
+do
+  print_banner "$image"
+  set -x
+  docker pull "$image"
+  # Trying python setup.py install instead of pip install to ensure direct access to error codes.
+  # Maybe pip install would be ok too but this works.
+  # `set -e` is enabled right after the best-effort uninstalls, so a failed
+  # clone, cd, or checkout aborts the run instead of building the wrong tree.
+  docker run --runtime=nvidia --rm "$image" /bin/bash -c "yes | pip uninstall apex; yes | pip uninstall apex; set -e; git clone https://github.com/NVIDIA/apex.git; cd apex; git checkout $branch; python setup.py install --cuda_ext --cpp_ext"
+  # Capture the status immediately, before `set +x` overwrites $?.
+  exit_code=$?
+  set +x
+  if [ "$exit_code" -ne 0 ]
+  then
+    print_red "Exit code: $exit_code"
+  else
+    print_green "Exit code: $exit_code"
+  fi
+  exit_codes[$image]=$exit_code
+done
+# Report the recorded exit code of every image, then exit nonzero if any
+# of them was not 0.
+overall_status=0
+for image in "${images[@]}"; do
+  code=${exit_codes[$image]}
+  if [ $code != 0 ]; then
+    print_red "$image : $code"
+    overall_status=1
+  else
+    print_green "$image : $code"
+  fi
+done
+
+if [ $overall_status != 0 ]; then
+  print_red "Overall status:  failure"
+else
+  print_green "Overall status:  success"
+fi
+exit $overall_status