Unverified Commit 932ca5a3 authored by Yi Zhang's avatar Yi Zhang Committed by GitHub
Browse files

enable windows cuda unit tests (#4421)



* enable cuda for unittest

* add one check

* rename

* minor fix

* add more check

* add cuda envs

* add more cuda envs

* add driver update

* update vs integration

* get cuda version

* merge

* add envs in install.sh

* reduce some change and add comments

* minor improve

* fix typo

* exit if driver update failed

* fix lint

* fix lint

* Avoid catching exception to show error message

* try 11.1

* try 11.1

* "try 11.1"

* set the downloaded gpu driver dlls only for 10.2

* check torch.cuda.is_available()

* add display.driver

* Revert "Avoid catching exception to show error message"

This reverts commit 1ce58c50de9fefcc3438d29ff398b941f9b6a757.

* Update .circleci/unittest/windows/scripts/set_cuda_envs.sh

* Update .circleci/unittest/windows/scripts/set_cuda_envs.sh

* Update .circleci/unittest/windows/scripts/set_cuda_envs.sh
Co-authored-by: default avatarNicolas Hug <nicolashug@fb.com>
Co-authored-by: default avatarVasilis Vryniotis <datumbox@users.noreply.github.com>
Co-authored-by: default avatarNikita Shulga <nikita.shulga@gmail.com>
Co-authored-by: default avatarNikita Shulga <nshulga@fb.com>
parent eb2fb253
......@@ -763,6 +763,12 @@ jobs:
paths:
- conda
- env
- run:
name: Install CUDA
command: packaging/windows/internal/cuda_install.bat
- run:
name: Update CUDA driver
command: packaging/windows/internal/driver_update.bat
- run:
name: Install torchvision
command: .circleci/unittest/windows/scripts/install.sh
......
......@@ -763,6 +763,12 @@ jobs:
paths:
- conda
- env
- run:
name: Install CUDA
command: packaging/windows/internal/cuda_install.bat
- run:
name: Update CUDA driver
command: packaging/windows/internal/driver_update.bat
- run:
name: Install torchvision
command: .circleci/unittest/windows/scripts/install.sh
......
......@@ -5,7 +5,7 @@ unset PYTORCH_VERSION
# so no need to set PYTORCH_VERSION.
# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config.
set -e
set -ex
this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
......@@ -35,5 +35,17 @@ if [ $PYTHON_VERSION == "3.6" ]; then
pip install pillow>=5.3.0
fi
torch_cuda=$(python -c "import torch; print(torch.cuda.is_available())")
echo torch.cuda.is_available is $torch_cuda
if [ ! -z "${CUDA_VERSION:-}" ] ; then
if [ "$torch_cuda" == "False" ]; then
echo "torch with cuda installed but torch.cuda.is_available() is False"
exit 1
fi
fi
source "$this_dir/set_cuda_envs.sh"
printf "* Installing torchvision\n"
"$this_dir/vc_env_helper.bat" python setup.py develop
......@@ -5,6 +5,9 @@ set -e
eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')"
conda activate ./env
this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source "$this_dir/set_cuda_envs.sh"
export PYTORCH_TEST_WITH_SLOW='1'
python -m torch.utils.collect_env
pytest --cov=torchvision --junitxml=test-results/junit.xml -v --durations 20 test --ignore=test/test_datasets_download.py
#!/usr/bin/env bash
set -ex
if [ "${CU_VERSION:-}" == "cpu" ] ; then
exit 0
fi
echo CU_VERSION is "${CU_VERSION}"
echo CUDA_VERSION is "${CUDA_VERSION}"
if [[ ${#CU_VERSION} -eq 5 ]]; then
CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}"
# Currenly, CU_VERSION and CUDA_VERSION are not consistent.
# to understand this code, see https://github.com/pytorch/vision/issues/4443
version="cpu"
if [[ ! -z "${CUDA_VERSION}" ]] ; then
version="$CUDA_VERSION"
else
if [[ ${#CU_VERSION} -eq 5 ]]; then
version="${CU_VERSION:2:2}.${CU_VERSION:4:1}"
fi
fi
# It's a log to see if CU_VERSION exists, if not, we use environment CUDA_VERSION directly
# in unittest_windows_gpu, there's no CU_VERSION, but CUDA_VERSION.
echo "Using CUDA $CUDA_VERSION, CU_VERSION is $CU_VERSION now"
# Don't use if [[ "$version" == "cpu" ]]; then exit 0 fi.
# It would exit the shell. One result is cpu tests would not run if the shell exit.
# Unless there's an error, Don't exit.
if [[ "$version" != "cpu" ]]; then
# set cuda envs
export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH"
export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
version=$CUDA_VERSION
if [ ! -d "$CUDA_PATH" ]; then
echo "$CUDA_PATH" does not exist
exit 1
fi
# set cuda envs
export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH"
export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
if [ ! -f "${CUDA_PATH}\include\nvjpeg.h" ]; then
echo "nvjpeg does not exist"
exit 1
fi
if [ ! -d "$CUDA_PATH" ]
then
echo "$CUDA_PATH" does not exist
exit 1
fi
# check cuda driver version
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
if [[ -x "$path" ]]; then
"$path" || echo "true";
break
fi
done
# check cuda driver version
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
if [[ -x "$path" ]]; then
"$path" || echo "true";
break
fi
done
which nvcc
which nvcc
nvcc --version
env | grep CUDA
fi
......@@ -9,6 +9,13 @@ set SRC_DIR=%~dp0\..
if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
rem in unit test workflow, we get CUDA_VERSION, for example 11.1
if defined CUDA_VERSION (
set CUDA_VER=%CUDA_VERSION:.=%
) else (
set CUDA_VER=%CU_VERSION:cu=%
)
set /a CUDA_VER=%CU_VERSION:cu=%
set CUDA_VER_MAJOR=%CUDA_VER:~0,-1%
set CUDA_VER_MINOR=%CUDA_VER:~-1,1%
......@@ -65,7 +72,7 @@ if not exist "%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.1.243_426.00_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe"
set "ARGS=nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
set "ARGS=nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvjpeg_10.1 nvjpeg_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
)
if not exist "%SRC_DIR%\temp_build\cudnn-10.1-windows10-x64-v7.6.4.38.zip" (
......@@ -82,7 +89,7 @@ if not exist "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.2.89_441.22_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe"
set "ARGS=nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2"
set "ARGS=nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvjpeg_10.2 nvjpeg_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2"
)
if not exist "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" (
......@@ -91,6 +98,15 @@ if not exist "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" (
set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip"
)
rem The below only for cu102, if it's used in other version, e.g. cu111, torch.cuda.is_availabe() would be False.
if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.7z" (
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip"
if errorlevel 1 exit /b 1
)
echo Installing GPU driver DLLs
7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -aoa -o"C:\Windows\System32"
goto cuda_common
:cuda110
......@@ -99,7 +115,7 @@ if not exist "%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.0.2_451.48_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe"
set "ARGS=nvcc_11.0 cuobjdump_11.0 nvprune_11.0 nvprof_11.0 cupti_11.0 cublas_11.0 cublas_dev_11.0 cudart_11.0 cufft_11.0 cufft_dev_11.0 curand_11.0 curand_dev_11.0 cusolver_11.0 cusolver_dev_11.0 cusparse_11.0 cusparse_dev_11.0 npp_11.0 npp_dev_11.0 nvrtc_11.0 nvrtc_dev_11.0 nvml_dev_11.0"
set "ARGS=nvcc_11.0 cuobjdump_11.0 nvprune_11.0 nvprof_11.0 cupti_11.0 cublas_11.0 cublas_dev_11.0 cudart_11.0 cufft_11.0 cufft_dev_11.0 curand_11.0 curand_dev_11.0 cusolver_11.0 cusolver_dev_11.0 cusparse_11.0 cusparse_dev_11.0 npp_11.0 npp_dev_11.0 nvjpeg_11.0 nvjpeg_dev_11.0 nvrtc_11.0 nvrtc_dev_11.0 nvml_dev_11.0"
)
if not exist "%SRC_DIR%\temp_build\cudnn-11.0-windows-x64-v8.0.4.30.zip" (
......@@ -116,12 +132,12 @@ if not exist "%SRC_DIR%\temp_build\cuda_11.1.0_456.43_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.1.0_456.43_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.1.0_456.43_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.1.0_456.43_win10.exe"
set "ARGS=nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
set "ARGS=nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvjpeg_11.1 nvjpeg_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
)
@REM There is no downloadable driver for Tesla on CUDA 11.1 yet. We will use
@REM the driver inside CUDA
if "%JOB_EXECUTOR%" == "windows-with-nvidia-gpu" set "ARGS=%ARGS% Display.Driver"
set "ARGS=%ARGS% Display.Driver"
if not exist "%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-11.1-windows-x64-v8.0.5.39.zip --output "%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip"
......@@ -137,7 +153,7 @@ if not exist "%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.2.0_460.89_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe"
set "ARGS=nvcc_11.2 cuobjdump_11.2 nvprune_11.2 nvprof_11.2 cupti_11.2 cublas_11.2 cublas_dev_11.2 cudart_11.2 cufft_11.2 cufft_dev_11.2 curand_11.2 curand_dev_11.2 cusolver_11.2 cusolver_dev_11.2 cusparse_11.2 cusparse_dev_11.2 npp_11.2 npp_dev_11.2 nvrtc_11.2 nvrtc_dev_11.2 nvml_dev_11.2"
set "ARGS=nvcc_11.2 cuobjdump_11.2 nvprune_11.2 nvprof_11.2 cupti_11.2 cublas_11.2 cublas_dev_11.2 cudart_11.2 cufft_11.2 cufft_dev_11.2 curand_11.2 curand_dev_11.2 cusolver_11.2 cusolver_dev_11.2 cusparse_11.2 cusparse_dev_11.2 npp_11.2 npp_dev_11.2 nvjpeg_11.2 nvjpeg_dev_11.2 nvrtc_11.2 nvrtc_dev_11.2 nvml_dev_11.2"
)
if not exist "%SRC_DIR%\temp_build\cudnn-11.2-windows-x64-v8.1.0.77.zip" (
......@@ -175,11 +191,6 @@ if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" (
if errorlevel 1 exit /b 1
)
if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.7z" (
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip"
if errorlevel 1 exit /b 1
)
echo Installing CUDA toolkit...
7z x %CUDA_SETUP_FILE% -o"%SRC_DIR%\temp_build\cuda"
pushd "%SRC_DIR%\temp_build\cuda"
......@@ -221,8 +232,5 @@ xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\bin\*.*" "%ProgramFiles%\NVIDIA GPU Co
xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\lib\x64\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64"
xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include"
echo Installing GPU driver DLLs
7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -aoa -o"C:\Windows\System32"
echo Cleaning temp files
rd /s /q "%SRC_DIR%\temp_build" || ver > nul
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe"
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe
if errorlevel 1 exit /b 1
start /wait 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -s -noreboot
if errorlevel 1 exit /b 1
del 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe || ver > NUL
setlocal EnableDelayedExpansion
set NVIDIA_GPU_EXISTS=0
for /F "delims=" %%i in ('wmic path win32_VideoController get name') do (
set GPUS=%%i
if not "x!GPUS:NVIDIA=!" == "x!GPUS!" (
SET NVIDIA_GPU_EXISTS=1
goto gpu_check_end
)
)
:gpu_check_end
endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS%
if "%NVIDIA_GPU_EXISTS%" == "0" (
echo "CUDA Driver installation Failed"
exit /b 1
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment