Unverified Commit 5b8faa29 authored by mcarilli's avatar mcarilli Committed by GitHub
Browse files

Check cuda version (#216)

* Adding Torch + bare-metal nvcc version check and container build tests

* Putting a canary in the coalmine

* canary proved elusive

* Trying direct setup.py install

* this should work

* Removing canary

* hopefully this works
parent 6e5d9099
import torch import torch
from setuptools import setup, find_packages from setuptools import setup, find_packages
import subprocess
import sys import sys
if not torch.cuda.is_available(): if not torch.cuda.is_available():
print("Warning: Torch did not find available GPUs on this system.\n", print("\nWarning: Torch did not find available GPUs on this system.\n",
"If your intention is to cross-compile, this is not an error.") "If your intention is to cross-compile, this is not an error.\n")
print("torch.__version__ = ", torch.__version__) print("torch.__version__ = ", torch.__version__)
TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MAJOR = int(torch.__version__.split('.')[0])
...@@ -32,6 +33,25 @@ if "--cpp_ext" in sys.argv: ...@@ -32,6 +33,25 @@ if "--cpp_ext" in sys.argv:
CppExtension('apex_C', CppExtension('apex_C',
['csrc/flatten_unflatten.cpp',])) ['csrc/flatten_unflatten.cpp',]))
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
    """Warn if the local nvcc release differs from the CUDA version PyTorch was built with.

    Runs ``<cuda_dir>/bin/nvcc -V``, extracts the ``release <major>.<minor>``
    field from its output and compares it with ``torch.version.cuda``.  A
    mismatch only prints a warning; it is not treated as an error.

    Args:
        cuda_dir: Root directory of the CUDA toolkit; ``/bin/nvcc`` is
            appended to it to locate the compiler.

    Raises:
        OSError, subprocess.CalledProcessError: Propagated from
            ``subprocess.check_output`` if nvcc cannot be run or fails.
        ValueError: If no ``release <major>.<minor>`` field is found in the
            nvcc output.
    """
    # Imported locally so this function does not depend on a file-level import.
    import re

    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)

    # Match the whole major/minor numbers.  Taking only the first character of
    # the minor field would truncate multi-digit minors ("11.10" -> "1").
    release_match = re.search(r"\brelease\s+(\d+)\.(\d+)", raw_output)
    if release_match is None:
        raise ValueError("Could not find a 'release <major>.<minor>' field in the output of nvcc -V")
    bare_metal_major, bare_metal_minor = release_match.groups()

    print("\nCompiling cuda extensions with")
    print(raw_output + "from " + cuda_dir + "/bin\n")

    torch_cuda_version = torch.version.cuda
    if torch_cuda_version is None:
        # There is nothing to compare against, so report it instead of
        # crashing on None.split().
        print("\nWarning: torch.version.cuda is None, so the Cuda version used to compile "
              "Pytorch binaries could not be checked against nvcc.\n")
        return

    torch_version_fields = torch_cuda_version.split(".")
    torch_binary_major = torch_version_fields[0]
    torch_binary_minor = torch_version_fields[1]

    if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
        # TODO: make this a hard error?
        print("\nWarning:  Cuda extensions are being compiled with a version of Cuda that does "
              "not match the version used to compile Pytorch binaries.\n")
        print("Pytorch binaries were compiled with Cuda {}\n".format(torch.version.cuda))
if "--cuda_ext" in sys.argv: if "--cuda_ext" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--cuda_ext") sys.argv.remove("--cuda_ext")
...@@ -39,6 +59,8 @@ if "--cuda_ext" in sys.argv: ...@@ -39,6 +59,8 @@ if "--cuda_ext" in sys.argv:
if torch.utils.cpp_extension.CUDA_HOME is None: if torch.utils.cpp_extension.CUDA_HOME is None:
raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else: else:
check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
# Set up macros for forward/backward compatibility hack around # Set up macros for forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
version_ge_1_1 = [] version_ge_1_1 = []
......
#!/bin/bash
# Print a message as a banner: black text on a green background, preceded by
# three blank lines and followed by four newlines.
#   $1 - message text, printed verbatim.
print_banner() {
  # The message is passed as a %s argument rather than spliced into the format
  # string, so '%' or backslashes in it are not interpreted by printf.
  printf '\n\n\n\e[30m\e[42m%s\e[0m\n\n\n\n' "$1"
}
# Print a one-line message as black text on a green background.
#   $1 - message text, printed verbatim.
print_green() {
  # The message is passed as a %s argument rather than spliced into the format
  # string, so '%' or backslashes in it are not interpreted by printf.
  printf '\e[30m\e[42m%s\e[0m\n' "$1"
}
# Print a one-line message as black text on a red background.
#   $1 - message text, printed verbatim.
print_red() {
  # The message is passed as a %s argument rather than spliced into the format
  # string, so '%' or backslashes in it are not interpreted by printf.
  printf '\e[30m\e[41m%s\e[0m\n' "$1"
}
# Docker images to test. Each entry is pulled with `docker pull` and then used
# to run an apex build by the loop further down in this script.
images=(
"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.03-py3-devel"
"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
"pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
"pytorch/pytorch:1.0.1-cuda10.0-cudnn7-devel"
"pytorch/pytorch:1.0-cuda10.0-cudnn7-devel"
"pytorch/pytorch:nightly-devel-cuda9.2-cudnn7"
)
# Branch of the apex repository that is checked out inside each container
# before the build is run.
branch="master"
# Associative array for exit codes, keyed by image name.
declare -A exit_codes
# Seed every image with the placeholder "None" until its build has run.
# The array must be expanded with "${images[@]}": a bare `images` is just the
# literal word and would create a single bogus key named "images".
for image in "${images[@]}"
do
  exit_codes["$image"]="None"
done
# Build and install apex inside every image and record the exit status of
# `docker run` for each one.
for image in "${images[@]}"
do
  print_banner "$image"

  # Trying python setup.py install instead of pip install to ensure direct access to error codes.
  # Maybe pip install would be ok too but this works.
  container_cmd="yes | pip uninstall apex; yes | pip uninstall apex; git clone https://github.com/NVIDIA/apex.git; cd apex; git checkout $branch; set -e; python setup.py install --cuda_ext --cpp_ext"

  # Trace only the docker commands and the status capture.
  set -x
  docker pull "$image"
  docker run --runtime=nvidia --rm "$image" /bin/bash -c "$container_cmd"
  exit_code=$?
  set +x

  # Green for a zero exit status, red for anything else.
  if [ "$exit_code" = 0 ]
  then
    print_green "Exit code: $exit_code"
  else
    print_red "Exit code: $exit_code"
  fi

  exit_codes[$image]=$exit_code
done
# Print a per-image summary of the recorded exit codes, then exit with 0 only
# if every image reported exit code 0 (1 otherwise).
overall_status=0
for image in "${images[@]}"
do
  # Substitute a marker when no exit code was recorded, so the entry is
  # reported as a failure. An empty value in an unquoted `[ ... != 0 ]` makes
  # the test itself error out and fall through to the success branch.
  exit_code="${exit_codes[$image]:-missing}"
  if [ "$exit_code" != 0 ]
  then
    print_red "$image : $exit_code"
    overall_status=1
  else
    print_green "$image : $exit_code"
  fi
done

if [ "$overall_status" != 0 ]
then
  print_red "Overall status:  failure"
else
  print_green "Overall status:  success"
fi

exit "$overall_status"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment