Unverified commit fc73954a, authored by Chaitanya Sri Krishna Lolla and committed by GitHub

Merge branch 'master' into ifu_05152020

parents 3bae8c83 02a5274b
@@ -6,11 +6,17 @@ from itertools import product
 def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
     # Exception handling for 18.04 compatibility
     if check_overflow:
-        cpu_sum = float(model_grad.float().sum())
+        if model_grad.is_sparse:
+            cpu_sum = float(model_grad.float()._values().sum())
+        else:
+            cpu_sum = float(model_grad.float().sum())
         if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
             return True
     if master_grad is not model_grad: # copy_ probably internally short-circuits this
-        master_grad.copy_(model_grad)
+        if model_grad.is_sparse:
+            master_grad.copy_(model_grad.to_dense())
+        else:
+            master_grad.copy_(model_grad)
     if scale != 1.0:
         master_grad.mul_(scale)
@@ -19,6 +25,9 @@ def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
 def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
     # Exception handling for 18.04 compatibility
     if check_overflow:
-        cpu_sum = float(model_grad.float().sum())
+        if model_grad.is_sparse:
+            cpu_sum = float(model_grad.float()._values().sum())
+        else:
+            cpu_sum = float(model_grad.float().sum())
         if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
             return True
......
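For context, the overflow check exploits two facts: inf and -inf propagate through a sum, and NaN is the only value that compares unequal to itself; for sparse gradients only the materialized non-zero values are summed. A minimal standalone sketch of the same logic (hypothetical helper name, plain PyTorch API):

import torch

def grad_overflowed(grad):
    # Sparse COO gradients only materialize their non-zero values, so sum those;
    # for dense gradients, sum the whole tensor.
    vals = grad._values() if grad.is_sparse else grad
    s = float(vals.float().sum())
    # inf/-inf propagate through the sum; NaN is caught by s != s.
    return s == float('inf') or s == -float('inf') or s != s

print(grad_overflowed(torch.tensor([1.0, float('nan')])))  # True
print(grad_overflowed(torch.tensor([1.0, 2.0])))           # False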
@@ -172,8 +172,8 @@ void cuWelfordMuSigma2(
       for (; l+7 < n2; l+=8*numx) {
         for (int k = 0; k < 8; k+=2) {
           float2 curr = __half22float2(*((__half2*)(lvals+l+k)));
-          cuWelfordOnlineSum(curr.x,mu,sigma2,count);
-          cuWelfordOnlineSum(curr.y,mu,sigma2,count);
+          cuWelfordOnlineSum<float>(curr.x,mu,sigma2,count);
+          cuWelfordOnlineSum<float>(curr.y,mu,sigma2,count);
         }
       }
       for (; l < n2; ++l) {
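cuWelfordOnlineSum accumulates a running mean and sum of squared deviations using Welford's online algorithm; the explicit <float> template argument simply pins the instantiation that was previously deduced, which appears to have tripped up the HIP compiler. A reference Python version of one Welford update step (a sketch of the algorithm, not the CUDA code):

def welford_online_sum(curr, mu, sigma2, count):
    # One Welford step: sigma2 accumulates the sum of squared deviations
    # (divide by count at the end to get the variance).
    count += 1
    delta = curr - mu
    mu += delta / count
    delta2 = curr - mu
    sigma2 += delta * delta2
    return mu, sigma2, count

mu = sigma2 = count = 0.0
for x in [1.0, 2.0, 3.0, 4.0]:
    mu, sigma2, count = welford_online_sum(x, mu, sigma2, count)
print(mu, sigma2 / count)  # 2.5 1.25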
@@ -230,9 +230,15 @@ void cuWelfordMuSigma2(
 template<typename U> U rsqrt(U v) {
   return U(1) / sqrt(v);
 }
+#if defined __HIP_PLATFORM_HCC__
+__device__ float rsqrt(float v) {
+  return rsqrtf(v);
+}
+#else
 template<> float rsqrt(float v) {
   return rsqrtf(v);
 }
+#endif
 template<> double rsqrt(double v) {
   return rsqrt(v);
 }
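Under ROCm, __HIP_PLATFORM_HCC__ selects a plain __device__ overload of rsqrt(float) instead of an explicit template specialization, presumably because the specialization collided with HIP's own rsqrt declarations. Semantically every path computes the reciprocal square root; a trivial Python reference (illustrative only):

import math

def rsqrt(v):
    # Reciprocal square root; on the GPU the float path maps to the
    # hardware rsqrtf intrinsic rather than a division plus sqrt.
    return 1.0 / math.sqrt(v)

print(rsqrt(4.0))  # 0.5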
@@ -293,7 +299,7 @@ void cuApplyLayerNorm(
   // 1) blockDim.x == warpSize
   // 2) Tensors are contiguous
   //
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
     SharedMemory<U> shared;
     U* buf = shared.getPointer();
     U mu,sigma2;
@@ -531,7 +537,7 @@ void cuComputeGradInput(
     const T* gamma,
     T* grad_input)
 {
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
     U sum_loss1 = U(0);
     U sum_loss2 = U(0);
     const U c_mean = mean[i1];
......
@@ -56,7 +56,7 @@ void multi_tensor_apply(
     for(int t = 0; t < tensor_lists[l].size(); t++)
     {
       // TODO: Print which tensor fails.
-      bool contiguous_memory = tensor_lists[l][t].is_contiguous();
+      bool contiguous_memory = (tensor_lists[l][t].is_sparse()) ? tensor_lists[l][t]._values().is_contiguous() : tensor_lists[l][t].is_contiguous();
 #ifdef VERSION_GE_1_5
       contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
 #endif
@@ -78,8 +78,15 @@ void multi_tensor_apply(
   for(int t = 0; t < ntensors; t++)
   {
     tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
-    for(int d = 0; d < depth; d++)
-      tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
+    for(int d = 0; d < depth; d++) {
+      if (tensor_lists[d][t].is_sparse()) {
+        at::Tensor dst = at::zeros(tensor_lists[d][t].sizes(), tensor_lists[d][t].options().layout(at::kStrided));
+        dst.add_(tensor_lists[d][t]);
+        tl.addresses[d][loc_tensor_info] = dst.data_ptr();
+      } else {
+        tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
+      }
+    }
     loc_tensor_info++;
     int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
......
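multi_tensor_apply hands raw data_ptr() addresses to the kernels, which a sparse COO tensor cannot supply directly, so the new branch first materializes a dense, strided buffer and accumulates the sparse values into it. The same densification at the Python level (a sketch with a made-up tensor; note the dense copy is a temporary, so in-place writes through its pointer would not propagate back to the sparse tensor):

import torch

# A hypothetical sparse COO gradient: values 1.0 and 3.0 at indices 0 and 2.
sparse = torch.sparse_coo_tensor(torch.tensor([[0, 2]]),
                                 torch.tensor([1.0, 3.0]), (4,))

# Mirror of the C++ branch: allocate a strided (dense) zero tensor,
# then accumulate the sparse tensor into it before taking its pointer.
dst = torch.zeros(sparse.shape)
dst.add_(sparse)
print(dst)  # tensor([1., 0., 3., 0.])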
@@ -141,8 +141,13 @@ __device__ __forceinline__ T reduce_block_into_lanes
       // __SYNCWARP();
       #pragma unroll
-      for(int i = 16; i >= lanes; i >>= 1)
-        final = final + __shfl_down_sync(0xffffffff, final, i);
+      for(int i = 16; i >= lanes; i >>= 1) {
+#ifdef __HIP_PLATFORM_HCC__
+        final = final + __shfl_down(0xffffffff, final, i);
+#else
+        final = final + __shfl_down_sync(0xffffffff, final, i);
+#endif
+      }
   }
   if(share_result)
@@ -191,8 +196,13 @@ __device__ __forceinline__ T reduce_block_into_lanes_max_op
       // __SYNCWARP();
       #pragma unroll
-      for(int i = 16; i >= lanes; i >>= 1)
-        final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
+      for(int i = 16; i >= lanes; i >>= 1) {
+#ifdef __HIP_PLATFORM_HCC__
+        final = fmaxf(fabsf(final), fabsf(__shfl_down(0xffffffff, final, i)));
+#else
+        final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
+#endif
+      }
   }
   if(share_result)
......
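Both reduce_block_into_lanes helpers finish with a warp-level tree reduction: on each step every lane combines its value with the one held i lanes above it, halving the stride from 16 down to lanes. CUDA's __shfl_down_sync takes an explicit member mask while HIP's __shfl_down does not, hence the #ifdef. A pure-Python model of the additive variant over a 32-lane warp (illustrative only; real shuffles act on registers in lockstep):

def warp_reduce_sum(lane_vals, lanes=1):
    # lane_vals: 32 per-lane values. Mimics
    #   final = final + shfl_down(final, i)  for i = 16, 8, ..., lanes.
    vals = list(lane_vals)
    i = 16
    while i >= lanes:
        for lane in range(32):      # ascending order, so vals[lane + i]
            if lane + i < 32:       # is still this step's input value
                vals[lane] += vals[lane + i]
        i >>= 1
    return vals[0]  # lane 0 ends up holding the full warp sum

print(warp_reduce_sum(range(32)))  # 496 == sum(range(32))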
@@ -6,6 +6,8 @@ import sys
 import warnings
 import os
+from torch.utils.hipify import hipify_python
+
 # ninja build does not work unless include_dirs are abs path
 this_dir = os.path.dirname(os.path.abspath(__file__))
...@@ -100,11 +102,26 @@ if "--cuda_ext" in sys.argv: ...@@ -100,11 +102,26 @@ if "--cuda_ext" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--cuda_ext") sys.argv.remove("--cuda_ext")
if torch.utils.cpp_extension.CUDA_HOME is None: is_rocm_pytorch = False
if torch.__version__ >= '1.5':
from torch.utils.cpp_extension import ROCM_HOME
is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
if torch.utils.cpp_extension.CUDA_HOME is None and (not is_rocm_pytorch):
raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") raise RuntimeError("--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else: else:
if not is_rocm_pytorch:
check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME) check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
if is_rocm_pytorch:
import shutil
with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx:
hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*",
show_detailed=True, is_pytorch_extension=True, clean_ctx=clean_ctx)
shutil.copy("csrc/compat.h", "csrc/hip/compat.h")
shutil.copy("csrc/type_shim.h", "csrc/hip/type_shim.h")
if not is_rocm_pytorch:
ext_modules.append( ext_modules.append(
CUDAExtension(name='amp_C', CUDAExtension(name='amp_C',
sources=['csrc/amp_C_frontend.cpp', sources=['csrc/amp_C_frontend.cpp',
...@@ -123,13 +140,35 @@ if "--cuda_ext" in sys.argv: ...@@ -123,13 +140,35 @@ if "--cuda_ext" in sys.argv:
'-O3', '-O3',
# '--resource-usage', # '--resource-usage',
'--use_fast_math'] + version_dependent_macros})) '--use_fast_math'] + version_dependent_macros}))
else:
print ("INFO: Building Multitensor apply extension")
ext_modules.append(
CUDAExtension(name='amp_C',
sources=['csrc/amp_C_frontend.cpp',
'csrc/hip/multi_tensor_sgd_kernel.hip',
'csrc/hip/multi_tensor_scale_kernel.hip',
'csrc/hip/multi_tensor_axpby_kernel.hip',
'csrc/hip/multi_tensor_l2norm_kernel.hip',
'csrc/hip/multi_tensor_lamb_stage_1.hip',
'csrc/hip/multi_tensor_lamb_stage_2.hip',
'csrc/hip/multi_tensor_adam.hip',
'csrc/hip/multi_tensor_novograd.hip',
'csrc/hip/multi_tensor_lamb.hip'],
extra_compile_args={'cxx' : ['-O3'] + version_dependent_macros,
'nvcc': []}))
if not is_rocm_pytorch:
ext_modules.append( ext_modules.append(
CUDAExtension(name='syncbn', CUDAExtension(name='syncbn',
sources=['csrc/syncbn.cpp', sources=['csrc/syncbn.cpp',
'csrc/welford.cu'], 'csrc/welford.cu'],
extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
'nvcc':['-O3'] + version_dependent_macros})) 'nvcc':['-O3'] + version_dependent_macros}))
else:
print ("INFO: Skipping syncbn extension.")
if not is_rocm_pytorch:
ext_modules.append( ext_modules.append(
CUDAExtension(name='fused_layer_norm_cuda', CUDAExtension(name='fused_layer_norm_cuda',
sources=['csrc/layer_norm_cuda.cpp', sources=['csrc/layer_norm_cuda.cpp',
...@@ -138,13 +177,24 @@ if "--cuda_ext" in sys.argv: ...@@ -138,13 +177,24 @@ if "--cuda_ext" in sys.argv:
'nvcc':['-maxrregcount=50', 'nvcc':['-maxrregcount=50',
'-O3', '-O3',
'--use_fast_math'] + version_dependent_macros})) '--use_fast_math'] + version_dependent_macros}))
else:
print ("INFO: Building FusedLayerNorm extension.")
ext_modules.append(
CUDAExtension(name='fused_layer_norm_cuda',
sources=['csrc/layer_norm_cuda.cpp',
'csrc/hip/layer_norm_hip_kernel.hip'],
extra_compile_args={'cxx' : ['-O3'] + version_dependent_macros,
'nvcc' : []}))
if not is_rocm_pytorch:
ext_modules.append( ext_modules.append(
CUDAExtension(name='mlp_cuda', CUDAExtension(name='mlp_cuda',
sources=['csrc/mlp.cpp', sources=['csrc/mlp.cpp',
'csrc/mlp_cuda.cu'], 'csrc/mlp_cuda.cu'],
extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
'nvcc':['-O3'] + version_dependent_macros})) 'nvcc':['-O3'] + version_dependent_macros}))
else:
print ("INFO: Skipping MLP extension")
if "--bnp" in sys.argv: if "--bnp" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension from torch.utils.cpp_extension import CUDAExtension
......
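Taken together, the setup.py changes make a ROCm build run the CUDA sources in csrc/ through the hipify translator, build amp_C and fused_layer_norm_cuda from the generated .hip files, and skip the syncbn and mlp_cuda extensions. A small sketch of the same detection logic, for checking which path an environment would take (public torch attributes only):

import torch

is_rocm_pytorch = False
if torch.__version__ >= '1.5':
    try:
        from torch.utils.cpp_extension import ROCM_HOME
        is_rocm_pytorch = (torch.version.hip is not None) and (ROCM_HOME is not None)
    except ImportError:
        pass  # builds without ROCM_HOME are CUDA-only

print("ROCm PyTorch:", is_rocm_pytorch)
print("HIP runtime:", torch.version.hip)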