"vscode:/vscode.git/clone" did not exist on "2df85862a51711b49abd95b8797900a6511599d3"
Unverified Commit 1235f3b0, authored by Hang Zhang, committed by GitHub

Support PyTorch 1.6.0 (#309)

* doc

* pre-compile

* fix dispatch
parent f46bcf7f
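Taken together, the diff below makes three changes. First, the Python operators stop importing the JIT-compiled `lib` package (`from .. import lib`) and instead import the pre-compiled extension modules `encoding.cpu` and `encoding.gpu`, guarding the GPU import so CPU-only machines can still import the package. Second, the C++/CUDA sources replace the deprecated `Tensor::type()` argument of the `AT_DISPATCH_FLOATING_TYPES` macro with `Tensor::scalar_type()`. Third, `setup.py` gains a `get_extensions()` helper so the extensions are built once at install time. A minimal sketch of the resulting backend-dispatch pattern (`my_op` and `some_kernel` are hypothetical placeholders; the real operators below all follow this shape):

    import torch

    # Pre-compiled extension modules replace the old JIT-compiled `lib` package;
    # the GPU module is importable only when the CUDA extension was built.
    from encoding import cpu
    if torch.cuda.device_count() > 0:
        from encoding import gpu

    def my_op(x):
        # Hypothetical wrapper: every ported operator dispatches on x.is_cuda.
        if x.is_cuda:
            return gpu.some_kernel(x)
        return cpu.some_kernel(x)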
@@ -10,11 +10,16 @@ Installation
 * PIP Install::
 
     pip install torch-encoding --pre
+    # macOS
+    CC=clang CXX=clang++ pip install torch-encoding --pre
 
 * Install from source::
 
     git clone https://github.com/zhanghang1989/PyTorch-Encoding && cd PyTorch-Encoding
+    # ubuntu
     python setup.py install
+    # macOS
+    CC=clang CXX=clang++ python setup.py install
 
 Using Docker
...
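(The new macOS lines pin the build to `clang`/`clang++`, presumably because the bundled C++ extension build otherwise picks up a toolchain that Apple's SDK does not support.)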
@@ -11,7 +11,10 @@
 import torch
 from torch.autograd import Variable, Function
-from .. import lib
+from encoding import cpu
+if torch.cuda.device_count() > 0:
+    from encoding import gpu
 
 __all__ = ['NonMaxSuppression']
@@ -49,6 +52,6 @@ def NonMaxSuppression(boxes, scores, threshold):
     >>> surviving_box_indices = indices[mask]
     """
     if boxes.is_cuda:
-        return lib.gpu.non_max_suppression(boxes, scores, threshold)
+        return gpu.non_max_suppression(boxes, scores, threshold)
     else:
-        return lib.cpu.non_max_suppression(boxes, scores, threshold)
+        return cpu.non_max_suppression(boxes, scores, threshold)
...
@@ -8,7 +8,10 @@
 import torch
 from torch.autograd.function import Function
-from .. import lib
+from encoding import cpu
+if torch.cuda.device_count() > 0:
+    from encoding import gpu
 
 __all__ = ['dist_syncbatchnorm']
@@ -25,9 +28,9 @@ class dist_syncbatchnorm_(Function):
             _ex, _var = running_mean.contiguous(), running_var.contiguous()
             _exs = _var + _ex ** 2
             if x.is_cuda:
-                y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
+                y = gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
             else:
-                y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
+                y = cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
             ctx.save_for_backward(x, _ex, _exs, gamma, beta)
             return y
@@ -36,7 +39,7 @@ class dist_syncbatchnorm_(Function):
            raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
         if x.is_cuda:
-            _ex, _exs = lib.gpu.expectation_forward(x)
+            _ex, _exs = gpu.expectation_forward(x)
         else:
             raise NotImplemented
@@ -62,9 +65,9 @@ class dist_syncbatchnorm_(Function):
         # BN forward + activation
         if x.is_cuda:
-            y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
+            y = gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
         else:
-            y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
+            y = cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
         ctx.save_for_backward(x, _ex, _exs, gamma, beta)
         return y
@@ -77,7 +80,7 @@ class dist_syncbatchnorm_(Function):
         # BN backward
         if dz.is_cuda:
             dx, _dex, _dexs, dgamma, dbeta = \
-                lib.gpu.batchnorm_backward(dz, x, _ex, _exs, gamma, beta, ctx.eps)
+                gpu.batchnorm_backward(dz, x, _ex, _exs, gamma, beta, ctx.eps)
         else:
             raise NotImplemented
@@ -96,7 +99,7 @@ class dist_syncbatchnorm_(Function):
         _dexs = _dexs / count
         if x.is_cuda:
-            dx_ = lib.gpu.expectation_backward(x, _dex, _dexs)
+            dx_ = gpu.expectation_backward(x, _dex, _dexs)
         else:
             raise NotImplemented
         dx = dx + dx_
...
@@ -11,7 +11,10 @@
 import torch
 from torch.autograd import Function, Variable
 import torch.nn.functional as F
-from .. import lib
+from encoding import cpu
+if torch.cuda.device_count() > 0:
+    from encoding import gpu
 
 __all__ = ['aggregate', 'scaled_l2', 'pairwise_cosine']
@@ -21,18 +24,18 @@ class _aggregate(Function):
         # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
         ctx.save_for_backward(A, X, C)
         if A.is_cuda:
-            E = lib.gpu.aggregate_forward(A, X, C)
+            E = gpu.aggregate_forward(A, X, C)
         else:
-            E = lib.cpu.aggregate_forward(A, X, C)
+            E = cpu.aggregate_forward(A, X, C)
         return E
 
     @staticmethod
     def backward(ctx, gradE):
         A, X, C = ctx.saved_variables
         if A.is_cuda:
-            gradA, gradX, gradC = lib.gpu.aggregate_backward(gradE, A, X, C)
+            gradA, gradX, gradC = gpu.aggregate_backward(gradE, A, X, C)
         else:
-            gradA, gradX, gradC = lib.cpu.aggregate_backward(gradE, A, X, C)
+            gradA, gradX, gradC = cpu.aggregate_backward(gradE, A, X, C)
         return gradA, gradX, gradC
 
 def aggregate(A, X, C):
@@ -64,9 +67,9 @@ class _scaled_l2(Function):
     @staticmethod
     def forward(ctx, X, C, S):
         if X.is_cuda:
-            SL = lib.gpu.scaled_l2_forward(X, C, S)
+            SL = gpu.scaled_l2_forward(X, C, S)
         else:
-            SL = lib.cpu.scaled_l2_forward(X, C, S)
+            SL = cpu.scaled_l2_forward(X, C, S)
         ctx.save_for_backward(X, C, S, SL)
         return SL
@@ -74,9 +77,9 @@ class _scaled_l2(Function):
     def backward(ctx, gradSL):
         X, C, S, SL = ctx.saved_variables
         if X.is_cuda:
-            gradX, gradC, gradS = lib.gpu.scaled_l2_backward(gradSL, X, C, S, SL)
+            gradX, gradC, gradS = gpu.scaled_l2_backward(gradSL, X, C, S, SL)
         else:
-            gradX, gradC, gradS = lib.cpu.scaled_l2_backward(gradSL, X, C, S, SL)
+            gradX, gradC, gradS = cpu.scaled_l2_backward(gradSL, X, C, S, SL)
         return gradX, gradC, gradS
 
 def scaled_l2(X, C, S):
...
@@ -10,7 +10,9 @@
 import torch
 from torch.autograd import Function
-from .. import lib
+from encoding import cpu
+if torch.cuda.device_count() > 0:
+    from encoding import gpu
 
 __all__ = ['rectify']
@@ -26,9 +28,9 @@ class _rectify(Function):
         ctx.dilation = dilation
         ctx.average = average
         if x.is_cuda:
-            lib.gpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
+            gpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
         else:
-            lib.cpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
+            cpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
         ctx.mark_dirty(y)
         return y
@@ -36,10 +38,10 @@ class _rectify(Function):
     def backward(ctx, grad_y):
         x, = ctx.saved_variables
         if x.is_cuda:
-            lib.gpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
-                                 ctx.padding, ctx.dilation, ctx.average)
+            gpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
+                             ctx.padding, ctx.dilation, ctx.average)
         else:
-            lib.cpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
-                                 ctx.padding, ctx.dilation, ctx.average)
+            cpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
+                             ctx.padding, ctx.dilation, ctx.average)
         ctx.mark_dirty(grad_y)
         return grad_y, None, None, None, None, None, None
...
@@ -12,7 +12,10 @@ import torch
 import torch.cuda.comm as comm
 from torch.autograd import Function
 from torch.autograd.function import once_differentiable
-from .. import lib
+from encoding import cpu
+if torch.cuda.device_count() > 0:
+    from encoding import gpu
 
 __all__ = ['moments', 'syncbatchnorm', 'inp_syncbatchnorm']
@@ -20,7 +23,7 @@ class moments_(Function):
     @staticmethod
     def forward(ctx, x):
         if x.is_cuda:
-            ex, ex2 = lib.gpu.expectation_forward(x)
+            ex, ex2 = gpu.expectation_forward(x)
         else:
             raise NotImplemented
         ctx.save_for_backward(x)
@@ -30,7 +33,7 @@
     def backward(ctx, dex, dex2):
         x, = ctx.saved_tensors
         if dex.is_cuda:
-            dx = lib.gpu.expectation_backward(x, dex, dex2)
+            dx = gpu.expectation_backward(x, dex, dex2)
         else:
             raise NotImplemented
         return dx
@@ -57,7 +60,7 @@ class syncbatchnorm_(Function):
         if ctx.training:
             if x.is_cuda:
-                _ex, _exs = lib.gpu.expectation_forward(x)
+                _ex, _exs = gpu.expectation_forward(x)
             else:
                 raise NotImplemented
@@ -94,9 +97,9 @@ class syncbatchnorm_(Function):
         # BN forward + activation
         if x.is_cuda:
-            y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
+            y = gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
         else:
-            y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
+            y = cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
 
         # Output
         ctx.save_for_backward(x, _ex, _exs, gamma, beta)
@@ -111,7 +114,7 @@ class syncbatchnorm_(Function):
         # BN backward
         if dz.is_cuda:
             dx, _dex, _dexs, dgamma, dbeta = \
-                lib.gpu.batchnorm_backward(dz, x, _ex, _exs, gamma, beta, ctx.eps)
+                gpu.batchnorm_backward(dz, x, _ex, _exs, gamma, beta, ctx.eps)
         else:
             raise NotImplemented
@@ -137,7 +140,7 @@ class syncbatchnorm_(Function):
             ctx.worker_queue.task_done()
         if x.is_cuda:
-            dx_ = lib.gpu.expectation_backward(x, _dex, _dexs)
+            dx_ = gpu.expectation_backward(x, _dex, _dexs)
         else:
             raise NotImplemented
         dx = dx + dx_
@@ -158,7 +161,7 @@ def _act_forward(ctx, x):
     if ctx.activation.lower() == "leaky_relu":
         if x.is_cuda:
-            lib.gpu.leaky_relu_forward(x, ctx.slope)
+            gpu.leaky_relu_forward(x, ctx.slope)
         else:
             raise NotImplemented
     else:
@@ -167,7 +170,7 @@ def _act_forward(ctx, x):
 def _act_backward(ctx, x, dx):
     if ctx.activation.lower() == "leaky_relu":
         if x.is_cuda:
-            lib.gpu.leaky_relu_backward(x, dx, ctx.slope)
+            gpu.leaky_relu_backward(x, dx, ctx.slope)
         else:
             raise NotImplemented
     else:
@@ -194,7 +197,7 @@ class inp_syncbatchnorm_(Function):
         if ctx.training:
             if x.is_cuda:
-                _ex, _exs = lib.gpu.expectation_forward(x)
+                _ex, _exs = gpu.expectation_forward(x)
             else:
                 raise NotImplemented
@@ -232,7 +235,7 @@ class inp_syncbatchnorm_(Function):
         # BN forward + activation
         if x.is_cuda:
-            lib.gpu.batchnorm_inp_forward(x, _ex, _exs, gamma, beta, ctx.eps)
+            gpu.batchnorm_inp_forward(x, _ex, _exs, gamma, beta, ctx.eps)
         else:
             raise NotImplemented
@@ -254,7 +257,7 @@ class inp_syncbatchnorm_(Function):
         # BN backward
         if dz.is_cuda:
             dx, _dex, _dexs, dgamma, dbeta = \
-                lib.gpu.batchnorm_inp_backward(dz, z, _ex, _exs, gamma, beta, ctx.eps)
+                gpu.batchnorm_inp_backward(dz, z, _ex, _exs, gamma, beta, ctx.eps)
         else:
             raise NotImplemented
@@ -280,7 +283,7 @@ class inp_syncbatchnorm_(Function):
             ctx.worker_queue.task_done()
         if z.is_cuda:
-            lib.gpu.expectation_inp_backward(dx, z, _dex, _dexs, _ex, _exs, gamma, beta, ctx.eps)
+            gpu.expectation_inp_backward(dx, z, _dex, _dexs, _ex, _exs, gamma, beta, ctx.eps)
         else:
             raise NotImplemented
...
# The JIT build module (the old `lib` package) that this commit retires in
# favor of the pre-compiled extensions: it compiled the C++/CUDA sources with
# torch.utils.cpp_extension.load() on first import.
import os
import torch
from torch.utils.cpp_extension import load

cwd = os.path.dirname(os.path.realpath(__file__))
cpu_path = os.path.join(cwd, 'cpu')
gpu_path = os.path.join(cwd, 'gpu')

cpu = load('enclib_cpu', [
    os.path.join(cpu_path, 'operator.cpp'),
    os.path.join(cpu_path, 'encoding_cpu.cpp'),
    os.path.join(cpu_path, 'syncbn_cpu.cpp'),
    os.path.join(cpu_path, 'roi_align_cpu.cpp'),
    os.path.join(cpu_path, 'nms_cpu.cpp'),
    os.path.join(cpu_path, 'rectify_cpu.cpp'),
], build_directory=cpu_path, verbose=False)

if torch.cuda.is_available():
    gpu = load('enclib_gpu', [
        os.path.join(gpu_path, 'operator.cpp'),
        os.path.join(gpu_path, 'activation_kernel.cu'),
        os.path.join(gpu_path, 'encoding_kernel.cu'),
        os.path.join(gpu_path, 'syncbn_kernel.cu'),
        os.path.join(gpu_path, 'roi_align_kernel.cu'),
        os.path.join(gpu_path, 'nms_kernel.cu'),
        os.path.join(gpu_path, 'rectify_cuda.cu'),
        os.path.join(gpu_path, 'lib_ssd.cu'),
    ], extra_cuda_cflags=["--expt-extended-lambda"],
    build_directory=gpu_path, verbose=False)
@@ -194,7 +194,7 @@ void conv_rectify_cpu_tempalte(
   at::Tensor input = input_.contiguous();
-  AT_DISPATCH_FLOATING_TYPES(input.type(), "conv_rectify_cuda_frame", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "conv_rectify_cuda_frame", ([&] {
     scalar_t *output_data = output.data_ptr<scalar_t>();
     conv_rectify_cpu_frame<scalar_t>(
       output_data,
...
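This one-token change is the "fix dispatch" item from the commit message: `Tensor::type()` is deprecated in ATen, and the `AT_DISPATCH_*` macros take the `at::ScalarType` returned by `scalar_type()` directly. The same substitution is applied at every dispatch site in the CPU and CUDA sources below.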
@@ -410,7 +410,7 @@ at::Tensor ROIAlign_Forward_CPU(
   AT_ASSERT(input.is_contiguous());
   AT_ASSERT(bottom_rois.is_contiguous());
-  AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_Forward_CPU", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_Forward_CPU", ([&] {
     ROIAlignForwardCompute<scalar_t>(
       output.numel(),
       input.data<scalar_t>(),
...
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CppExtension

setup(
    name='enclib_cpu',
    ext_modules=[
        CppExtension('enclib_cpu', [
            'operator.cpp',
            'roi_align_cpu.cpp',
            'encoding_cpu.cpp',
            'syncbn_cpu.cpp',
            'nms_cpu.cpp',
        ]),
    ],
    cmdclass={
        'build_ext': BuildExtension
    })
@@ -10,19 +10,19 @@
 namespace {
-template<typename T>
-inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
-  // Create thrust pointers
-  thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
-  thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
-
-  thrust::transform_if(th_dz, th_dz + count, th_z, th_dz,
-                       [slope] __device__ (const T& dz) { return dz * slope; },
-                       [] __device__ (const T& z) { return z < 0; });
-  thrust::transform_if(th_z, th_z + count, th_z,
-                       [slope] __device__ (const T& z) { return z / slope; },
-                       [] __device__ (const T& z) { return z < 0; });
-}
+// template<typename T>
+// inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
+//   // Create thrust pointers
+//   thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
+//   thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
+//
+//   thrust::transform_if(th_dz, th_dz + count, th_z, th_dz,
+//                        [slope] __device__ (const T& dz) { return dz * slope; },
+//                        [] __device__ (const T& z) { return z < 0; });
+//   thrust::transform_if(th_z, th_z + count, th_z,
+//                        [slope] __device__ (const T& z) { return z / slope; },
+//                        [] __device__ (const T& z) { return z < 0; });
+// }
 }
@@ -33,12 +33,12 @@ void LeakyRelu_Forward_CUDA(at::Tensor z, float slope) {
 void LeakyRelu_Backward_CUDA(at::Tensor z, at::Tensor dz, float slope) {
   int64_t count = z.numel();
+  /*
   AT_DISPATCH_FLOATING_TYPES(z.type(), "LeakyRelu_Backward_CUDA", ([&] {
     leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
   }));
-  /*
+  */
   // unstable after scaling
   at::leaky_relu_(z, 1.0 / slope);
   at::leaky_relu_backward(dz, z, slope);
-  */
 }
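The hunk above retires the hand-rolled Thrust kernel and re-enables the ATen fallback that was previously commented out (the old "unstable after scaling" caveat stays as a comment). The fallback works because leaky ReLU with a positive slope is invertible: re-applying it in place with the reciprocal slope recovers the pre-activation input, which is exactly what `at::leaky_relu_backward` needs. A minimal sketch of that identity (Python, shapes illustrative):

    import torch
    import torch.nn.functional as F

    x = torch.randn(8)
    slope = 0.01
    z = F.leaky_relu(x, slope)
    # For slope > 0, applying leaky_relu again with slope 1/slope inverts it:
    # positive entries are untouched, negative entries were scaled by slope
    # and are now scaled by 1/slope.
    x_recovered = F.leaky_relu(z, 1.0 / slope)
    assert torch.allclose(x, x_recovered)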
@@ -172,7 +172,7 @@ at::Tensor Aggregate_Forward_CUDA(
   dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
   dim3 threads(getNumThreads(X_.size(1)));
-  AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Forward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(A_.scalar_type(), "Aggregate_Forward_CUDA", ([&] {
     DeviceTensor<scalar_t, 3> E = devicetensor<scalar_t, 3>(E_);
     DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
     DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
@@ -197,7 +197,7 @@ std::vector<at::Tensor> Aggregate_Backward_CUDA(
   // B, K, D
   dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
   dim3 threads(getNumThreads(C_.size(1)));
-  AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Backward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(A_.scalar_type(), "Aggregate_Backward_CUDA", ([&] {
     /* Device tensors */
     DeviceTensor<scalar_t, 3> GA = devicetensor<scalar_t, 3>(gradA_);
     DeviceTensor<scalar_t, 3> GE = devicetensor<scalar_t, 3>(GE_);
@@ -220,7 +220,7 @@ at::Tensor ScaledL2_Forward_CUDA(
   dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
   dim3 threads(getNumThreads(C_.size(1)));
-  AT_DISPATCH_FLOATING_TYPES(X_.type(), "ScaledL2_Forward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(X_.scalar_type(), "ScaledL2_Forward_CUDA", ([&] {
     /* Device tensors */
     DeviceTensor<scalar_t, 3> SL = devicetensor<scalar_t, 3>(SL_);
     DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
@@ -249,7 +249,7 @@ std::vector<at::Tensor> ScaledL2_Backward_CUDA(
   dim3 blocks2(C_.size(1), C_.size(0));
   dim3 threads2(getNumThreads(X_.size(1)));
   auto GS_ = (GSL_ * (SL_ / S_.view({1, 1, C_.size(0)}))).sum(0).sum(0);
-  AT_DISPATCH_FLOATING_TYPES(X_.type(), "ScaledL2_Backward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(X_.scalar_type(), "ScaledL2_Backward_CUDA", ([&] {
     /* Device tensors */
     DeviceTensor<scalar_t, 3> GSL = devicetensor<scalar_t, 3>(GSL_);
     DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
...
@@ -179,7 +179,7 @@ void conv_rectify_cuda_tempalte(
   const uint32_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
   const uint32_t num_blocks = at::cuda::ATenCeilDiv<uint32_t>(count, num_threads);
-  AT_DISPATCH_FLOATING_TYPES(input.type(), "conv_rectify_cuda_frame", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "conv_rectify_cuda_frame", ([&] {
     //using accscalar_t = acc_type<scalar_t, true>;
     scalar_t *output_data = output.data_ptr<scalar_t>();
     conv_rectify_cuda_frame<scalar_t, scalar_t>
...
@@ -372,7 +372,7 @@ at::Tensor ROIAlign_Forward_CUDA(
   auto count = output.numel();
-  AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_Forward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_Forward_CUDA", ([&] {
     RoIAlignForwardKernel<scalar_t>
       <<<ROI_GET_BLOCKS(count),
          ROI_CUDA_NUM_THREADS,
@@ -419,7 +419,7 @@ at::Tensor ROIAlign_Backward_CUDA(
   auto num_rois = rois.size(0);
   auto count = grad_output.numel();
-  AT_DISPATCH_FLOATING_TYPES(rois.type(), "ROIAlign_Backward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(rois.scalar_type(), "ROIAlign_Backward_CUDA", ([&] {
     RoIAlignBackwardKernel<scalar_t>
       <<<ROI_GET_BLOCKS(count),
          ROI_CUDA_NUM_THREADS,
...
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='enclib_gpu',
    ext_modules=[
        CUDAExtension('enclib_gpu', [
            'operator.cpp',
            'activation_kernel.cu',
            'encoding_kernel.cu',
            'syncbn_kernel.cu',
            'roi_align_kernel.cu',
            'nms_kernel.cu',
            'rectify.cu',
        ]),
    ],
    cmdclass={
        'build_ext': BuildExtension
    })
@@ -274,7 +274,7 @@ at::Tensor BatchNorm_Forward_CUDA(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   dim3 blocks(input_.size(1));
   dim3 threads(getNumThreads(input_.size(2)));
-  AT_DISPATCH_FLOATING_TYPES(input_.type(), "BatchNorm_Forward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(input_.scalar_type(), "BatchNorm_Forward_CUDA", ([&] {
     /* Device tensors */
     DeviceTensor<scalar_t, 3> output = devicetensor<scalar_t, 3>(output_);
     DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
@@ -301,7 +301,7 @@ at::Tensor BatchNorm_Forward_Inp_CUDA(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   dim3 blocks(input_.size(1));
   dim3 threads(getNumThreads(input_.size(2)));
-  AT_DISPATCH_FLOATING_TYPES(input_.type(), "BatchNorm_Forward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(input_.scalar_type(), "BatchNorm_Forward_CUDA", ([&] {
     /* Device tensors */
     DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
     DeviceTensor<scalar_t, 1> ex = devicetensor<scalar_t, 1>(ex_);
@@ -336,7 +336,7 @@ std::vector<at::Tensor> BatchNorm_Inp_Backward_CUDA(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   dim3 blocks(output_.size(1));
   dim3 threads(getNumThreads(output_.size(2)));
-  AT_DISPATCH_FLOATING_TYPES(output_.type(), "BatchNorm_Inp_Backward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(output_.scalar_type(), "BatchNorm_Inp_Backward_CUDA", ([&] {
     /* Device tensors */
     DeviceTensor<scalar_t, 3> gradoutput = devicetensor<scalar_t, 3>(gradoutput_);
     DeviceTensor<scalar_t, 3> output = devicetensor<scalar_t, 3>(output_);
@@ -379,7 +379,7 @@ std::vector<at::Tensor> BatchNorm_Backward_CUDA(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   dim3 blocks(input_.size(1));
   dim3 threads(getNumThreads(input_.size(2)));
-  AT_DISPATCH_FLOATING_TYPES(input_.type(), "BatchNorm_Inp_Backward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(input_.scalar_type(), "BatchNorm_Inp_Backward_CUDA", ([&] {
     /* Device tensors */
     DeviceTensor<scalar_t, 3> gradoutput = devicetensor<scalar_t, 3>(gradoutput_);
     DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
@@ -411,7 +411,7 @@ std::vector<at::Tensor> Expectation_Forward_CUDA(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   dim3 blocks(input_.size(1));
   dim3 threads(getNumThreads(input_.size(2)));
-  AT_DISPATCH_FLOATING_TYPES(input_.type(), "SumSquare_forward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(input_.scalar_type(), "SumSquare_forward_CUDA", ([&] {
     scalar_t norm = scalar_t(1) / (input_.size(0) * input_.size(2));
     /* Device tensors */
     DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
@@ -435,7 +435,7 @@ at::Tensor Expectation_Backward_CUDA(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   dim3 blocks(input_.size(1));
   dim3 threads(getNumThreads(input_.size(2)));
-  AT_DISPATCH_FLOATING_TYPES(input_.type(), "SumSquare_Backward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(input_.scalar_type(), "SumSquare_Backward_CUDA", ([&] {
     scalar_t norm = scalar_t(1) / (input_.size(0) * input_.size(2));
     /* Device tensors */
     DeviceTensor<scalar_t, 3> gradInput = devicetensor<scalar_t, 3>(gradInput_);
@@ -467,7 +467,7 @@ at::Tensor Expectation_Inp_Backward_CUDA(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   dim3 blocks(output_.size(1));
   dim3 threads(getNumThreads(output_.size(2)));
-  AT_DISPATCH_FLOATING_TYPES(output_.type(), "SumSquare_Backward_CUDA", ([&] {
+  AT_DISPATCH_FLOATING_TYPES(output_.scalar_type(), "SumSquare_Backward_CUDA", ([&] {
     scalar_t norm = scalar_t(1) / (output_.size(0) * output_.size(2));
     /* Device tensors */
     DeviceTensor<scalar_t, 3> gradInput = devicetensor<scalar_t, 3>(gradInput_);
...
-- [Link to the EncNet CIFAR experiments and pre-trained models](http://hangzh.com/PyTorch-Encoding/experiments/cifar.html)
-- [Link to the Deep TEN experiments and pre-trained models](http://hangzh.com/PyTorch-Encoding/experiments/texture.html)
+- [Link to Docs](https://hangzhang.org/PyTorch-Encoding/model_zoo/imagenet.html)
@@ -10,10 +10,14 @@
 import io
 import os
+import glob
 import subprocess
 
 from setuptools import setup, find_packages
+import torch
+from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
 
 cwd = os.path.dirname(os.path.abspath(__file__))
 
 version = '1.2.2'
@@ -46,6 +50,55 @@ requirements = [
     'requests',
 ]
 
+def get_extensions():
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    cpu_extensions_dir = os.path.join(this_dir, "encoding", "lib", "cpu")
+    gpu_extensions_dir = os.path.join(this_dir, "encoding", "lib", "gpu")
+
+    source_cpu = glob.glob(os.path.join(cpu_extensions_dir, "*.cpp"))
+    source_cuda = glob.glob(os.path.join(gpu_extensions_dir, "*.cpp")) + \
+        glob.glob(os.path.join(gpu_extensions_dir, "*.cu"))
+    print('c++: ', source_cpu)
+    print('cuda: ', source_cuda)
+
+    sources = source_cpu
+    extra_compile_args = {"cxx": []}
+    include_dirs = [cpu_extensions_dir]
+
+    ext_modules = [
+        CppExtension(
+            "encoding.cpu",
+            source_cpu,
+            include_dirs=include_dirs,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+
+    if CUDA_HOME is not None:
+        define_macros = [("WITH_CUDA", None)]
+        include_dirs += [gpu_extensions_dir]
+        extra_compile_args["nvcc"] = [
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+        ext_modules.extend([
+            CUDAExtension(
+                "encoding.gpu",
+                source_cuda,
+                include_dirs=include_dirs,
+                define_macros=define_macros,
+                extra_compile_args=extra_compile_args,
+            )
+        ])
+
+    return ext_modules
+
 if __name__ == '__main__':
     create_version_file()
     setup(
@@ -68,4 +121,6 @@ if __name__ == '__main__':
             'lib/gpu/*.cpp',
             'lib/gpu/*.cu',
         ]},
+        ext_modules=get_extensions(),
+        cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
     )
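With `get_extensions()` wired into `setup()`, `python setup.py install` now compiles `encoding.cpu` (and, when `CUDA_HOME` is set, `encoding.gpu`) once at install time instead of JIT-compiling on first import. A quick post-install sanity check, sketched under the assumption that the package installed cleanly:

    import torch
    from encoding import cpu           # pre-built CPU operators, no JIT step
    if torch.cuda.is_available():
        from encoding import gpu       # present only when built with CUDA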