Unverified commit 07f25381 authored by Hang Zhang, committed by GitHub
parents cebf1341 70fdeb79
"""Dilated ResNet and DenseNet"""
from .resnet import *
from .densenet import *
"""Dilated DenseNet"""
from collections import OrderedDict
import torch
import torch.utils.model_zoo as model_zoo
from .. import nn
from .. import functions as F
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
model_urls = {
'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
def densenet121(pretrained=False, **kwargs):
r"""Densenet-121 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet121']))
return model
def densenet169(pretrained=False, **kwargs):
r"""Densenet-169 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet169']))
return model
def densenet201(pretrained=False, **kwargs):
r"""Densenet-201 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet201']))
return model
def densenet161(pretrained=False, **kwargs):
r"""Densenet-161 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet161']))
return model
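A minimal usage sketch for the factory functions above (not part of the module): it assumes the package's CUDA extension is built, since the custom dilated layers are imported from it, and pretrained=True needs network access for the one-time weight download from the URLs above.
if __name__ == '__main__':
    model = densenet121(pretrained=True)   # downloads the ImageNet weights once
    model.eval()
    print(model.classifier.in_features)    # 1024 feature channels before the (unused) head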
class _DenseLayer(nn.Sequential):
# pylint: disable=expression-not-assigned
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, dilation=1):
super(_DenseLayer, self).__init__()
self.add_module('norm.1', nn.BatchNorm2d(num_input_features)),
self.add_module('relu.1', nn.ReLU(inplace=True)),
self.add_module('conv.1', nn.Conv2d(
num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)),
self.add_module('relu.2', nn.ReLU(inplace=True)),
self.add_module('conv.2', nn.Conv2d(
bn_size * growth_rate, growth_rate, kernel_size=3, stride=1,
padding=dilation, dilation=dilation, bias=False)),
self.drop_rate = drop_rate
def forward(self, x):
new_features = super(_DenseLayer, self).forward(x)
if self.drop_rate > 0:
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
return torch.cat([x, new_features], 1)
class _DenseBlock(nn.Sequential):
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, dilation=1):
super(_DenseBlock, self).__init__()
for i in range(num_layers):
layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate, dilation=dilation)
self.add_module('denselayer%d' % (i + 1), layer)
class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features, stride, dilation=1):
super(_Transition, self).__init__()
self.add_module('norm', nn.BatchNorm2d(num_input_features))
self.add_module('relu', nn.ReLU(inplace=True))
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
kernel_size=1, stride=1, bias=False))
self.add_module('pool', nn.DilatedAvgPool2d(kernel_size=2, stride=stride,
dilation=dilation))
class DenseNet(nn.Module):
r"""Dilated DenseNet.
For correct dilation of the DenseNet transition layers, we implement :class:`encoding.nn.DilatedAvgPool2d`.
Args:
growth_rate (int) - how many filters to add each layer (`k` in paper)
block_config (list of 4 ints) - how many layers in each pooling block
num_init_features (int) - the number of filters to learn in the first convolution layer
bn_size (int) - multiplicative factor for the number of bottleneck layers
(i.e. bn_size * k features in the bottleneck layer)
drop_rate (float) - dropout rate after each dense layer
num_classes (int) - number of classification classes
Reference:
Huang, Gao, et al. "Densely Connected Convolutional Networks" *CVPR 2017*
"""
def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
super(DenseNet, self).__init__()
# First convolution
self.features = nn.Sequential(OrderedDict([
('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
('norm0', nn.BatchNorm2d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
]))
# Each denseblock
strides = [1, 2, 1, 1]
dilations = [1, 1, 2, 4]
num_features = num_init_features
for i, num_layers in enumerate(block_config):
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate,
dilation=dilations[i])
self.features.add_module('denseblock%d' % (i + 1), block)
num_features = num_features + num_layers * growth_rate
if i != len(block_config) - 1:
trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2, stride=strides[i+1], dilation=dilations[i])
self.features.add_module('transition%d' % (i + 1), trans)
num_features = num_features // 2
# Final batch norm
self.features.add_module('norm5', nn.BatchNorm2d(num_features))
# Linear layer
self.classifier = nn.Linear(num_features, num_classes)
def forward(self, x):
features = self.features(x)
out = F.relu(features, inplace=True)
# out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1)
# out = self.classifier(out)
return out
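The forward pass returns the stride-8 feature maps directly (the pooling and classifier lines are intentionally left commented out), so the network is used as a dense-prediction backbone. A purely arithmetic sketch of the resolution schedule for a hypothetical 224x224 input:
size = 224
# conv0 and pool0 each halve the resolution; only the first transition keeps
# stride 2, the later ones trade stride for dilation (dilations 2 and 4)
for s in (2, 2, 2, 1, 1):
    size //= s
print(size)  # 28, i.e. an overall output stride of 224 // 28 = 8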
@@ -26,15 +26,15 @@ class BasicBlock(nn.Module):
"""ResNet BasicBlock
"""
expansion = 1
def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1,
def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, previous_dilation=1,
norm_layer=None):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, bias=False)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=False)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
padding=first_dilation, dilation=first_dilation, bias=False)
padding=previous_dilation, dilation=previous_dilation, bias=False)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
@@ -64,7 +64,7 @@ class Bottleneck(nn.Module):
# pylint: disable=unused-argument
expansion = 4
def __init__(self, inplanes, planes, stride=1, dilation=1,
downsample=None, first_dilation=1, norm_layer=None):
downsample=None, previous_dilation=1, norm_layer=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = norm_layer(planes)
@@ -75,7 +75,7 @@ class Bottleneck(nn.Module):
self.conv3 = nn.Conv2d(
planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = norm_layer(planes * 4)
self.relu = nn.ReLU(inplace=False)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.dilation = dilation
self.stride = stride
@@ -113,6 +113,21 @@ class Bottleneck(nn.Module):
class ResNet(nn.Module):
"""Dilated Pre-trained ResNet Model, which preduces the stride of 8 featuremaps at conv5.
Parameters
----------
block : Block
Class for the residual block. Options are BasicBlockV1, BottleneckV1.
layers : list of int
Numbers of layers in each block
classes : int, default 1000
Number of classification classes.
dilated : bool, default True
Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
typically used in Semantic Segmentation.
norm_layer : object
Normalization layer used in the backbone network (default: :class:`torch.nn.BatchNorm2d`;
can be replaced by Synchronized Cross-GPU BatchNormalization).
Reference:
- He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
@@ -120,18 +135,26 @@ class ResNet(nn.Module):
- Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
"""
# pylint: disable=unused-variable
def __init__(self, block, layers, num_classes=1000, norm_layer=None):
def __init__(self, block, layers, num_classes=1000, dilated=True, norm_layer=nn.BatchNorm2d):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = norm_layer(64)
self.relu = nn.ReLU(inplace=False)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
if dilated:
self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
dilation=2, norm_layer=norm_layer)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
dilation=4, norm_layer=norm_layer)
else:
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
norm_layer=norm_layer)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
norm_layer=norm_layer)
self.avgpool = nn.AvgPool2d(7)
self.fc = nn.Linear(512 * block.expansion, num_classes)
@@ -155,16 +178,16 @@ class ResNet(nn.Module):
layers = []
if dilation == 1 or dilation == 2:
layers.append(block(self.inplanes, planes, stride, dilation=1,
downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
elif dilation == 4:
layers.append(block(self.inplanes, planes, stride, dilation=2,
downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
else:
raise RuntimeError("=> unknown dilation size: {}".format(dilation))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation,
layers.append(block(self.inplanes, planes, dilation=dilation, previous_dilation=dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
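A rough sketch of what the dilated flag changes, assuming the usual ResNet-50 configuration (Bottleneck, [3, 4, 6, 3]) and that nn above refers to the standard torch.nn layers; backbone_stride is a hypothetical helper, and the stem and stages are run by hand because the classification head expects stride-32 maps.
if __name__ == '__main__':
    import torch

    def backbone_stride(dilated):
        # overall downsampling factor of conv1 .. layer4 for a 224x224 input
        net = ResNet(Bottleneck, [3, 4, 6, 3], dilated=dilated).eval()
        x = torch.randn(1, 3, 224, 224)
        x = net.maxpool(net.relu(net.bn1(net.conv1(x))))
        x = net.layer4(net.layer3(net.layer2(net.layer1(x))))
        return 224 // x.size(3)

    print(backbone_stride(True), backbone_stride(False))  # expected: 8 32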
"""Encoding Autograd Fuctions"""
from .encoding import *
from .syncbn import *
from .customize import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Customized Functions"""
import math
import torch
from torch.autograd import Function, Variable
from torch.nn.modules.utils import _pair
from .._ext import encoding_lib
__all__ = ['dilatedavgpool2d']
class _dilatedavgpool2d(Function):
@staticmethod
def forward(ctx, input, kernel_size, stride, padding,
dilation=1):
ctx.kH, ctx.kW = _pair(kernel_size)
ctx.dH, ctx.dW = _pair(stride if stride is not None else kernel_size)
ctx.padH, ctx.padW = _pair(padding)
ctx.dilationH, ctx.dilationW = _pair(dilation)
b, c, h, w = input.size()
if ctx.dH == 1 and ctx.dW == 1:
# keep the size for dilated avgpool
ow, oh = w, h
else:
ow = math.floor(float(w-ctx.kW+2*ctx.padW)/float(ctx.dW)) +1
oh = math.floor(float(h-ctx.kH+2*ctx.padH)/float(ctx.dH)) +1
with torch.cuda.device_of(input):
output = input.new(b, c, oh, ow)
ctx.save_for_backward(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return output
@staticmethod
def backward(ctx, gradOutput):
input, = ctx.saved_variables
with torch.cuda.device_of(input):
gradInput = Variable(input.data.new().resize_as_(input.data))
if isinstance(input.data, torch.cuda.FloatTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
elif isinstance(input.data, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, None, None, None, None
def dilatedavgpool2d(input, kernel_size, stride=None, padding=0,
dilation=1):
"""Dilated Average Pool 2d, for dilation of DenseNet.
Reference:
Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang,
Ambrish Tyagi, Amit Agrawal. “Context Encoding for Semantic Segmentation. CVPR 2018
Applies 2D average-pooling operation in kh x kw regions by step size
dh x dw steps. The number of output features is equal to the number of
input planes.
See :class:`~encoding.nn.DilatedAvgPool2d` for details and output shape.
Args:
input: input tensor (minibatch x in_channels x iH x iW)
kernel_size: size of the pooling region, a single number or a
tuple (kh x kw)
stride: stride of the pooling operation, a single number or a
tuple (sh x sw). Default is equal to kernel size
padding: implicit zero padding on the input, a single number or
a tuple (padh x padw), Default: 0
dilation: the dilation parameter similar to Conv2d
"""
return _dilatedavgpool2d.apply(input, kernel_size, stride, padding, dilation)
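For reference, a CPU-only sketch of the window arithmetic used by the CUDA kernels (windows clamped at the border, averaging only over the positions actually sampled). The helper name is illustrative and reuses _pair imported above; the real operator is the extension call wrapped by _dilatedavgpool2d.
def _dilated_avg_pool2d_reference(x, kernel_size, stride=None, padding=0, dilation=1):
    kH, kW = _pair(kernel_size)
    dH, dW = _pair(stride if stride is not None else kernel_size)
    padH, padW = _pair(padding)
    dilH, dilW = _pair(dilation)
    b, c, h, w = x.size()
    if dH == 1 and dW == 1:
        oh, ow = h, w  # stride 1 keeps the spatial size
    else:
        oh = (h - kH + 2 * padH) // dH + 1
        ow = (w - kW + 2 * padW) // dW + 1
    y = x.new(b, c, oh, ow).zero_()
    for i in range(oh):
        for j in range(ow):
            hs, ws = i * dH - padH, j * dW - padW
            he, we = min(hs + kH * dilH, h), min(ws + kW * dilW, w)
            hs, ws = max(hs, 0), max(ws, 0)
            window = x[:, :, hs:he:dilH, ws:we:dilW]
            y[:, :, i, j] = window.mean(3).mean(2)
    return y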
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
## Email: zhanghang0704@gmail.com
## Copyright (c) 2018
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
@@ -11,7 +10,7 @@
"""Functions for Encoding Layer"""
import torch
from torch.autograd import Function, Variable
from .._ext import encoding_lib
from .. import lib
__all__ = ['aggregate', 'scaledL2']
@@ -20,47 +19,27 @@ class _aggregate(Function):
def forward(ctx, A, X, C):
# A \in (B,N,K), X \in (B,N,D), C \in (K,D) => E \in (B,K,D)
ctx.save_for_backward(A, X, C)
B, _, K = A.size()
D = X.size(2)
with torch.cuda.device_of(A):
E = A.new(B, K, D)
if isinstance(A, torch.cuda.FloatTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Float_aggregate_forward(E, A, X, C)
elif isinstance(A, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A):
encoding_lib.Encoding_Double_aggregate_forward(E, A, X, C)
if A.is_cuda:
E = lib.gpu.aggregate_forward(A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
raise NotImplementedError
return E
@staticmethod
def backward(ctx, gradE):
A, X, C = ctx.saved_variables
with torch.cuda.device_of(A):
gradA = Variable(A.data.new().resize_as_(A.data))
gradX = Variable(A.data.new().resize_as_(X.data))
gradC = Variable(A.data.new().resize_as_(C.data))
if isinstance(A.data, torch.cuda.FloatTensor):
with torch.cuda.device_of(A.data):
encoding_lib.Encoding_Float_aggregate_backward(gradA.data, \
gradE.data, A.data, X.data, C.data)
elif isinstance(A.data, torch.cuda.DoubleTensor):
with torch.cuda.device_of(A.data):
encoding_lib.Encoding_Double_aggregate_backward(gradA.data, \
gradE.data, A.data, X.data, C.data)
if A.is_cuda:
gradA, gradX, gradC = lib.gpu.aggregate_backward(gradE, A, X, C)
else:
raise RuntimeError('Unimplemented data type!')
gradX.data.copy_(torch.bmm(A, gradE).data)
gradC.data.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0).data)
raise NotImplementedError
return gradA, gradX, gradC
def aggregate(A, X, C):
r"""
Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
r""" Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
to the codewords (:math:`C`) with assignment weights (:math:`A`).
.. math::
e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - d_k)
Shape:
@@ -77,53 +56,31 @@ def aggregate(A, X, C):
>>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
>>> func = encoding.aggregate()
>>> E = func(A, X, C)
"""
return _aggregate.apply(A, X, C)
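Equivalently, the aggregation can be written with a batched matrix product in plain PyTorch, which is convenient as a CPU reference or for spot-checking the CUDA path; the helper name is illustrative only.
def _aggregate_reference(A, X, C):
    # A: (B, N, K), X: (B, N, D), C: (K, D)  ->  E: (B, K, D)
    # e_k = sum_i a_ik * x_i - (sum_i a_ik) * c_k
    return torch.bmm(A.transpose(1, 2), X) - A.sum(1).unsqueeze(2) * C.unsqueeze(0)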
class _scaledL2(Function):
@staticmethod
def forward(ctx, X, C, S):
B, N, _ = X.size()
K = C.size(0)
with torch.cuda.device_of(X):
SL = X.new(B, N, K)
if isinstance(X, torch.cuda.FloatTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
elif isinstance(X, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X):
encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
if X.is_cuda:
SL = lib.gpu.scaled_l2_forward(X, C, S)
else:
raise RuntimeError('Unimplemented data type!')
raise NotImplementedError
ctx.save_for_backward(X, C, S, SL)
return SL
@staticmethod
def backward(ctx, gradSL):
X, C, S, SL = ctx.saved_variables
K = C.size(0)
with torch.cuda.device_of(X.data):
gradX = Variable(X.data.new().resize_as_(X.data))
gradC = Variable(X.data.new().resize_as_(C.data))
gradS = Variable(X.data.new().resize_as_(S.data))
if isinstance(X.data, torch.cuda.FloatTensor):
with torch.cuda.device_of(X.data):
encoding_lib.Encoding_Float_scaledl2_backward(gradSL.data, \
gradX.data, gradC.data, X.data, C.data, S.data)
elif isinstance(X.data, torch.cuda.DoubleTensor):
with torch.cuda.device_of(X.data):
encoding_lib.Encoding_Double_scaledl2_backward(gradSL.data, \
gradX.data, gradC.data, X.data, C.data, S.data)
if X.is_cuda:
gradX, gradC, gradS = lib.gpu.scaled_l2_backward(gradSL, X, C, S, SL)
else:
raise RuntimeError('Unimplemented data type!')
gradS.data.copy_((gradSL*(SL/S.view(1, 1, K))).sum(0).sum(0).data)
raise NotImplementedError
return gradX, gradC, gradS
def scaledL2(X, C, S):
r"""
scaledL2 distance
r""" scaledL2 distance
.. math::
sl_{ik} = s_k \|x_i-c_k\|^2
@@ -134,6 +91,5 @@ def scaledL2(X, C, S):
(where :math:`B` is batch, :math:`N` is total number of features,
:math:`K` is number is codewords, :math:`D` is feature dimensions.)
- Output: :math:`E\in\mathcal{R}^{B\times N\times K}`
"""
return _scaledL2.apply(X, C, S)
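The same distance can be spelled out with broadcasting as a CPU reference (an illustrative helper, not part of the extension API):
def _scaled_l2_reference(X, C, S):
    # X: (B, N, D), C: (K, D), S: (K,)  ->  SL: (B, N, K)
    diff = X.unsqueeze(2) - C.unsqueeze(0).unsqueeze(0)   # (B, N, K, D)
    return S.view(1, 1, -1) * diff.pow(2).sum(3)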
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
## Email: zhanghang0704@gmail.com
## Copyright (c) 2018
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
@@ -11,9 +10,9 @@
"""Synchronized Cross-GPU Batch Normalization functions"""
import torch
from torch.autograd import Variable, Function
from .._ext import encoding_lib
from .. import lib
__all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
__all__ = ['sum_square', 'batchnormtrain']
def sum_square(input):
r"""Calculate sum of elements and sum of squares for Batch Normalization"""
@@ -24,91 +23,45 @@ class _sum_square(Function):
@staticmethod
def forward(ctx, input):
ctx.save_for_backward(input)
C = input.size(1)
with torch.cuda.device_of(input):
xsum = input.new().resize_(C).zero_()
xsquare = input.new().resize_(C).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_sum_square_Forward(
input, xsum, xsquare)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_sum_square_Forward(
input, xsum, xsquare)
if input.is_cuda:
xsum, xsqusum = lib.gpu.sumsquare_forward(input)
else:
raise RuntimeError('Unimplemented data type!', type(input))
return xsum, xsquare
raise NotImplementedError
return xsum, xsqusum
@staticmethod
def backward(ctx, gradSum, gradSquare):
input, = ctx.saved_variables
with torch.cuda.device_of(input.data):
gradInput = Variable(input.data.new().resize_as_(input.data).zero_())
if isinstance(input.data, torch.cuda.FloatTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Float_sum_square_Backward(
gradInput.data, input.data, gradSum.data, gradSquare.data)
elif isinstance(input.data, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Double_sum_square_Backward(
gradInput.data, input.data, gradSum.data, gradSquare.data)
if input.is_cuda:
gradInput = lib.gpu.sumsquare_backward(input, gradSum, gradSquare)
else:
raise RuntimeError('Unimplemented data type!')
raise NotImplementedError
return gradInput
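In plain PyTorch, the same per-channel statistics (for an input already flattened to B x C x N, the layout the CUDA kernel expects) are simply the following; the helper is illustrative only.
def _sum_square_reference(x):
    # x: (B, C, N)  ->  per-channel sum and sum of squares, each of shape (C,)
    xsum = x.sum(0).sum(1)
    xsquare = x.pow(2).sum(0).sum(1)
    return xsum, xsquare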
class _batchnorm(Function):
def __init__(self, training=False):
super(_batchnorm, self).__init__()
self.training = training
def forward(self, input, gamma, beta, mean, std):
self.save_for_backward(input, gamma, beta, mean, std)
assert(input.dim() == 3)
with torch.cuda.device_of(input):
invstd = 1.0 / std
output = input.new().resize_as_(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Forward(output, \
input, mean, invstd, gamma, beta)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Forward(output, \
input, mean, invstd, gamma, beta)
class _batchnormtrain(Function):
@staticmethod
def forward(ctx, input, mean, std, gamma, beta):
ctx.save_for_backward(input, mean, std, gamma, beta)
if input.is_cuda:
output = lib.gpu.batchnorm_forward(input, mean, std, gamma, beta)
else:
raise RuntimeError('Unimplemented data type!')
raise NotImplementedError
return output
def backward(self, gradOutput):
input, gamma, beta, mean, std = self.saved_tensors
invstd = 1.0 / std
with torch.cuda.device_of(input):
gradInput = gradOutput.new().resize_as_(input).zero_()
gradGamma = gradOutput.new().resize_as_(gamma).zero_()
gradBeta = gradOutput.new().resize_as_(beta).zero_()
gradMean = gradOutput.new().resize_as_(mean).zero_()
gradStd = gradOutput.new().resize_as_(std).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
self.training)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_batchnorm_Backward(
gradOutput, input, gradInput, gradGamma, gradBeta,
mean, invstd, gamma, beta, gradMean, gradStd,
self.training)
@staticmethod
def backward(ctx, gradOutput):
input, mean, std, gamma, beta = ctx.saved_variables
if gradOutput.is_cuda:
gradInput, gradMean, gradStd, gradGamma, gradBeta = \
lib.gpu.batchnorm_backward(gradOutput, input, mean,
std, gamma, beta, True)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, gradGamma, gradBeta, gradMean, gradStd
raise NotImplementedError
return gradInput, gradMean, gradStd, gradGamma, gradBeta
def batchnormtrain(input, gamma, beta, mean, std):
def batchnormtrain(input, mean, std, gamma, beta):
r"""Applies Batch Normalization over a 3d input that is seen as a
mini-batch.
@@ -123,14 +76,4 @@ def batchnormtrain(input, gamma, beta, mean, std):
- Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
"""
return _batchnorm(True)(input, gamma, beta, mean, std)
def batchnormeval(input, gamma, beta, mean, std):
r"""Applies Batch Normalization over a 3d input that is seen as a
mini-batch.
Please see encoding.batchnormtrain_
"""
return _batchnorm(False)(input, gamma, beta, mean, std)
return _batchnormtrain.apply(input, mean, std, gamma, beta)
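For reference, the normalization applied by batchnormtrain (and by the BatchNorm forward CUDA kernel further below) reduces to the broadcasted expression sketched here, assuming a flattened (B, C, N) input and per-channel mean, std, gamma and beta of shape (C,):
def _batchnormtrain_reference(x, mean, std, gamma, beta):
    # illustrative CPU equivalent: gamma * (x - mean) / std + beta, per channel
    mean, std = mean.view(1, -1, 1), std.view(1, -1, 1)
    gamma, beta = gamma.view(1, -1, 1), beta.view(1, -1, 1)
    return (x - mean) / std * gamma + beta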
// The maximum number of threads in a block
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 512;
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
static int getNumThreads(int nElem) {
int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
for (int i = 0; i != 5; ++i) {
if (nElem <= threadSizes[i]) {
return threadSizes[i];
}
}
return MAX_BLOCK_SIZE;
}
__device__ __forceinline__ int getMSB(int val) {
return 31 - __clz(val);
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/device_tensor.h"
#else
template <int Dim>
THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
if (!t) {
return THCDeviceTensor<real, Dim>();
}
int inDim = THCTensor_(nDimension)(state, t);
return toDeviceTensor<real, Dim>(state, t);
/*
if (inDim == Dim) {
return toDeviceTensor<real, Dim>(state, t);
}
// View in which the last dimensions are collapsed or expanded as needed
THAssert(THCTensor_(isContiguous)(state, t));
int size[Dim];
for (int i = 0; i < Dim || i < inDim; ++i) {
if (i < Dim && i < inDim) {
size[i] = t->size[i];
} else if (i < Dim) {
size[i] = 1;
} else {
size[Dim - 1] *= t->size[i];
}
}
return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
*/
}
struct Encoding_(Float2)
/*
* For reduce-sum calculation of two elements
*/
{
real v1, v2;
__device__ Encoding_(Float2)() {}
__device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
__device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a)
{
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if CUDA_VERSION >= 9000
unsigned int mask = 0xffffffff;
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor_sync(mask, val, 1 << i, WARP_SIZE);
}
#else
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor(val, 1 << i, WARP_SIZE);
}
#else
__shared__ real values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
#endif
return val;
}
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(
Encoding_(Float2) value)
{
value.v1 = Encoding_(rwarpSum)(value.v1);
value.v2 = Encoding_(rwarpSum)(value.v2);
return value;
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.c"
#else
__global__ void Encoding_(Aggregate_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
Encoding_(AggOp) g(A,X,C);
E[b][k][d] = Encoding_(reduce_agg)(g,b,k,d,N);
}
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregate the residuals forward with the assignment weights
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
if (THCTensor_(nDimension)(state, E_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// B, K, D
dim3 blocks(C.getSize(1), C.getSize(0), X.getSize(0));
// N
dim3 threads(getNumThreads(X.getSize(1)));
Encoding_(Aggregate_Forward_kernel)<<<blocks, threads, 0, stream>>>
(E, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Backward_kernel) (
THCDeviceTensor<real, 3> GA,
THCDeviceTensor<real, 3> GE,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating backward kernel function
* G (dl/dR), L (dl/dE), A
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
Encoding_(AggBackOp) g(GE,X,C);
GA[b][i][k] = Encoding_(reduce_aggback)(g,b,i,k,D);
}
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregate backward to assignment weights
* G (dl/dR), L (dl/dE), A
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 5, GA_, GE_, A_, X_, C_);
if (THCTensor_(nDimension)(state, GA_) != 3 ||
THCTensor_(nDimension)(state, GE_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// B, K, D
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
// N
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GA, GE, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2_Forward_kernel) (
THCDeviceTensor<real, 3> SL,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
* scaled-L2 distance forward kernel function
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
Encoding_(L2Op) g(X,C);
SL[b][i][k] = S[k] * Encoding_(reduce_sl2)(g,b,i,k,D);
}
void Encoding_(ScaledL2_Forward)(
THCState *state, THCTensor *SL_, THCTensor *X_,
THCTensor *C_, THCTensor *S_)
/*
* scaled-L2 distance forward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, SL_, X_, C_, S_);
if (THCTensor_(nDimension)(state, SL_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> SL = devicetensor<3>(state, SL_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(ScaledL2_Forward_kernel)<<<blocks, threads, 0, stream>>>
(SL, X, C, S);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2X_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 3> GX,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
i = blockIdx.y;
K = C.getSize(0);
/* main operation */
Encoding_(L2XBackOp) g(GSL,X,C,S);
GX[b][i][d] = Encoding_(reduce_sl2xback)(g,b,i,d,K);
}
__global__ void Encoding_(ScaledL2C_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 2> GC,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
Encoding_(L2CBackOp) g(GSL,X,C,S);
GC[k][d] = Encoding_(reduce_sl2cback)(g,k,d,B,N);
}
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, GSL_, GX_, GC_, X_, C_, S_);
if (THCTensor_(nDimension)(state, GSL_) != 3 ||
THCTensor_(nDimension)(state, GX_) != 3 ||
THCTensor_(nDimension)(state, GC_) != 2 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GSL = devicetensor<3>(state, GSL_);
THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
THCDeviceTensor<real, 2> GC = devicetensor<2>(state, GC_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(X.getSize(2), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(0)));
Encoding_(ScaledL2X_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GSL, GX, X, C, S);
THCudaCheck(cudaGetLastError());
dim3 blocks2(C.getSize(1), C.getSize(0));
dim3 threads2(getNumThreads(X.getSize(1)));
Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads2, 0, stream>>>
(GSL, GC, X, C, S);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.h"
#else
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(ScaledL2_Forward)( THCState *state, THCTensor *SL_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_utils.c"
#else
struct Encoding_(AggOp) {
__device__ Encoding_(AggOp)(THCDeviceTensor<real, 3> a,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: A(a), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> A;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_agg)(
Encoding_(AggOp) op,
int b, int k, int d, int N)
{
real sum = 0;
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(b,x,k,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(AggBackOp) {
__device__ Encoding_(AggBackOp)(THCDeviceTensor<real, 3> ge,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: GE(ge), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GE;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_aggback)(
Encoding_(AggBackOp) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2Op) {
__device__ Encoding_(L2Op)(THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
real r = X[b][i][d].ldg() - C[k][d].ldg();
return r * r;
}
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_sl2)(
Encoding_(L2Op) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2XBackOp) {
__device__ Encoding_(L2XBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return 2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2xback)(
Encoding_(L2XBackOp) op,
int b, int i, int d, int K)
{
real sum = 0;
for (int x = threadIdx.x; x < K; x += blockDim.x) {
sum += op(b,i,x,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2CBackOp) {
__device__ Encoding_(L2CBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return -2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2cback)(
Encoding_(L2CBackOp) op,
int k, int d, int B, int N)
{
real sum = 0;
for (int batch = 0; batch < B; ++batch) {
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(batch,x,k,d);
}
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.c"
#else
__global__ void Encoding_(DilatedAvgPool_Forward_kernel) (
THCDeviceTensor<real, 4> X,
THCDeviceTensor<real, 4> Y,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
* dilated avgpool2d forward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = Y.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= Y.getSize(3) || h >= Y.getSize(2)) return;
int hstart = h*dH -padH;
int wstart = w*dW -padW;
int hend = min(hstart + kH*dilationH, X.getSize(2));
int wend = min(wstart + kW*dilationW, X.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum = 0;
for (int th=hstart; th < hend; th+=dilationH) {
for (int tw=wstart; tw < wend; tw+=dilationW) {
sum += X[b][c][th][tw];
}
}
Y[b][c][h][w] = sum / pool_size;
}
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
* dilated avgpool2d forward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, X_, Y_);
if (THCTensor_(nDimension)(state, X_) != 4 ||
THCTensor_(nDimension)(state, Y_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> X = devicetensor<4>(state, X_);
THCDeviceTensor<real, 4> Y = devicetensor<4>(state, Y_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(Y.getSize(3)/16+1, Y.getSize(2)/16+1,
Y.getSize(1)*Y.getSize(0));
Encoding_(DilatedAvgPool_Forward_kernel)<<<blocks, threads, 0, stream>>>
(X, Y, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(DilatedAvgPool_Backward_kernel) (
THCDeviceTensor<real, 4> gradX,
THCDeviceTensor<real, 4> gradY,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
* dilated avgpool2d backward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = gradX.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= gradX.getSize(3) || h >= gradX.getSize(2)) return;
int phstart = (h + padH < ((kH-1)*dilationH+1)) ? 0 :
(h + padH - ((kH-1)*dilationH+1))/dH + 1;
int pwstart = (w + padW < ((kW-1)*dilationW+1)) ? 0 :
(w + padW - ((kW-1)*dilationW+1))/dW + 1;
int phend = min((h+padH)/dH+1, gradY.getSize(2));
int pwend = min((w+padW)/dW+1, gradY.getSize(3));
sum = 0;
int hstart, wstart, hend, wend, pool_size;
for (int ph=phstart; ph < phend; ++ph) {
for (int pw=pwstart; pw < pwend; ++pw) {
hstart = ph*dH -padH;
wstart = pw*dW -padW;
hend = min(hstart + kH*dilationH, gradX.getSize(2));
wend = min(wstart + kW*dilationW, gradX.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum += gradY[b][c][ph][pw] / pool_size;
}
}
gradX[b][c][h][w] = sum;
}
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
* dilated avgpool2d backward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, gradX_, gradY_);
if (THCTensor_(nDimension)(state, gradX_) != 4 ||
THCTensor_(nDimension)(state, gradY_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> gradX = devicetensor<4>(state, gradX_);
THCDeviceTensor<real, 4> gradY = devicetensor<4>(state, gradY_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(gradX.getSize(3)/16+1, gradX.getSize(2)/16+1,
gradX.getSize(1)*gradX.getSize(0));
Encoding_(DilatedAvgPool_Backward_kernel)<<<blocks, threads, 0, stream>>>
(gradX, gradY, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.h"
#else
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.c"
#else
__global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 3> output,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta)
{
int c = blockIdx.x;
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
real inp = input[b][c][x].ldg();
output[b][c][x] = gamma[c].ldg() * (inp - mean[c].ldg()) *
invstd[c].ldg() + beta[c].ldg();
}
}
}
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
/*
* batch norm forward function
* assuming the input is already flattened
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, output_, input_, mean_, invstd_,
gamma_, beta_);
if (THCTensor_(nDimension)(state, output_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1)
THError("BatchNorm2d forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> output = devicetensor<3>(state, output_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Forward_kernel)<<<blocks, threads, 0, stream>>>(
output, input, mean, invstd, gamma, beta);
THCudaCheck(cudaGetLastError());
}
struct Encoding_(GradOp) {
__device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
: mean(m), input(i), gradOutput(g) {}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = gradOutput[batch][plane][n].ldg();
real c = input[batch][plane][n].ldg() - mean;
return Encoding_(Float2)(g, g * c);
}
real mean;
THCDeviceTensor<real, 3> input;
THCDeviceTensor<real, 3> gradOutput;
};
// Sum across (batch, b/c/n) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce)(
Encoding_(GradOp) op,
THCDeviceTensor<real, 3> tensor,
int plane)
{
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
__global__ void Encoding_(BatchNorm_Backward_kernel) (
THCDeviceTensor<real, 3> gradoutput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 3> gradinput,
THCDeviceTensor<real, 1> gradgamma,
THCDeviceTensor<real, 1> gradbeta,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta,
THCDeviceTensor<real, 1> gradMean,
THCDeviceTensor<real, 1> gradStd,
int train)
{
/* declarations of the variables */
/* Get the index and channels */
int c = blockIdx.x;
/* main operation */
Encoding_(GradOp) g(mean[c], input, gradoutput);
Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
real gradOutputSum = res.v1;
real dotP = res.v2;
//real projScale = dotP * norm * invstd[c].ldg() * invstd[c].ldg();
real gradScale = invstd[c].ldg() * gamma[c].ldg();
if (train && threadIdx.x == 0) {
gradMean[c] = - gradOutputSum * gamma[c].ldg() * invstd[c].ldg();
gradStd[c] = - dotP * gamma[c].ldg() * invstd[c].ldg() * invstd[c].ldg();
}
if (gradinput.numElements() > 0) {
for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
gradinput[batch][c][x] = gradoutput[batch][c][x].ldg() * gradScale;
}
}
}
if (gradgamma.numElements() > 0) {
if (threadIdx.x == 0) {
gradgamma[c] += dotP * invstd[c].ldg();
}
}
if (gradbeta.numElements() > 0) {
if (threadIdx.x == 0) {
gradbeta[c] += gradOutputSum;
}
}
}
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
* batch norm backward function
* assuming the input is already flattened
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 9, gradoutput_, input_, gradinput_,
gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_);
if (THCTensor_(nDimension)(state, gradoutput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradinput_) != 3 ||
THCTensor_(nDimension)(state, gradgamma_) != 1 ||
THCTensor_(nDimension)(state, gradbeta_) != 1 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1 ||
THCTensor_(nDimension)(state, gradMean_) != 1 ||
THCTensor_(nDimension)(state, gradStd_) != 1 )
THError("BatchNorm2d backward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradoutput =
devicetensor<3>(state, gradoutput_);
THCDeviceTensor<real, 3> input =
devicetensor<3>(state, input_);
THCDeviceTensor<real, 3> gradinput =
devicetensor<3>(state, gradinput_);
THCDeviceTensor<real, 1> gradgamma =
devicetensor<1>(state, gradgamma_);
THCDeviceTensor<real, 1> gradbeta = devicetensor<1>(state, gradbeta_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
THCDeviceTensor<real, 1> gradMean = devicetensor<1>(state, gradMean_);
THCDeviceTensor<real, 1> gradStd = devicetensor<1>(state, gradStd_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradoutput, input, gradinput, gradgamma, gradbeta, mean, invstd,
gamma, beta, gradMean, gradStd, train);
THCudaCheck(cudaGetLastError());
}
struct Encoding_(SumOp) {
__device__ Encoding_(SumOp)(THCDeviceTensor<real, 3> i)
: input(i){}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = input[batch][plane][n].ldg();
return Encoding_(Float2)(g, g * g);
}
real mean;
THCDeviceTensor<real, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce_sum)(Encoding_(SumOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
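// NOTE: the 32-entry buffer caps this reduction at 32 warps per block
// (blockDim.x <= 32 * WARP_SIZE); the kernel launches below are expected
// to stay within that limit via getNumThreads.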
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
__global__ void Encoding_(Sum_Square_Forward_kernel) (
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> sum,
THCDeviceTensor<real, 1> square)
{
int c = blockIdx.x;
/* main operation */
Encoding_(SumOp) g(input);
Encoding_(Float2) res = Encoding_(reduce_sum)(g, input, c);
real xsum = res.v1;
real xsquare = res.v2;
if (threadIdx.x == 0) {
sum[c] = xsum;
square[c] = xsquare;
}
}
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_)
/*
 * Computes, for each channel c of a flattened (B, C, N) input,
 * the sum and the sum of squares over the B * N elements.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, input_, sum_, square_);
if (THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, sum_) != 1 ||
THCTensor_(nDimension)(state, square_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> sum = devicetensor<1>(state, sum_);
THCDeviceTensor<real, 1> square = devicetensor<1>(state, square_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Forward_kernel)<<<blocks, threads, 0, stream>>>(
input, sum, square);
THCudaCheck(cudaGetLastError());
}
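/* Illustrative sketch (not part of the original source): how the per-channel
 * outputs of Sum_Square_Forward turn into batch statistics. With M = B * N
 * elements per channel, mean[c] = sum[c] / M and
 * var[c] = square[c] / M - mean[c]^2. Because both reductions are plain sums,
 * they can be added across devices before the statistics are formed, which is
 * what a synchronized BatchNorm needs. The kernel below is hypothetical; the
 * tensors mean/var and the element count M are assumptions for the example. */
__global__ void Encoding_(MeanVar_From_SumSquare_Example_kernel)(
THCDeviceTensor<real, 1> sum,
THCDeviceTensor<real, 1> square,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> var,
int M)
{
int c = blockIdx.x * blockDim.x + threadIdx.x;
if (c < sum.getSize(0)) {
real s = sum[c];
real sq = square[c];
real m = s / M;
mean[c] = m;
/* var = E[x^2] - (E[x])^2 */
var[c] = sq / M - m * m;
}
}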
__global__ void Encoding_(Sum_Square_Backward_kernel) (
THCDeviceTensor<real, 3> gradInput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> gradSum,
THCDeviceTensor<real, 1> gradSquare)
{
int c = blockIdx.x;
/* main operation */
for (int batch = 0; batch < gradInput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradInput.getSize(2); x += blockDim.x)
{
gradInput[batch][c][x] = gradSum[c] + 2 * gradSquare[c] *
input[batch][c][x];
}
}
}
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput_, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_)
/*
 * Backward of Sum_Square: since d(sum)/dx = 1 and d(square)/dx = 2x,
 * gradInput = gradSum[c] + 2 * gradSquare[c] * input, broadcast per channel.
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, gradInput_, input_, gradSum_,
gradSquare_);
if (THCTensor_(nDimension)(state, gradInput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradSum_) != 1 ||
THCTensor_(nDimension)(state, gradSquare_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradInput = devicetensor<3>(state, gradInput_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> gradSum = devicetensor<1>(state, gradSum_);
THCDeviceTensor<real, 1> gradSquare = devicetensor<1>(state, gradSquare_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradInput, input, gradSum, gradSquare);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.h"
#else
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_);
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train);
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_);
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_);
#endif
Copied from the PyTorch lib to make compilation easier for users, in response to the many questions and requests.
#include <assert.h>
namespace detail {
template <typename T, int N>
__host__ __device__ void copy(T to[N], T from[N]) {
for (int i = 0; i < N; ++i) {
to[i] = from[i];
}
}
} // namespace detail
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor()
: data_(NULL) {
thc_static_assert(Dim > 0);
for (int i = 0; i < Dim; ++i) {
size_[i] = 0;
stride_[i] = (IndexT) 1;
}
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::
#ifdef _MSC_VER
THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim])
#else
THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim])
#endif
: data_(data) {
thc_static_assert(Dim > 0);
for (int i = 0; i < Dim; ++i) {
size_[i] = sizes[i];
}
stride_[Dim - 1] = (IndexT) 1;
for (int i = Dim - 2; i >= 0; --i) {
stride_[i] = stride_[i + 1] * sizes[i + 1];
}
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor(
#ifdef _MSC_VER
DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim])
#else
DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
#endif
: data_(data) {
thc_static_assert(Dim > 0);
for (int i = 0; i < Dim; ++i) {
size_[i] = sizes[i];
stride_[i] = strides[i];
}
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int OtherDim>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isSameSizeAndStride(
const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const {
if (Dim != OtherDim) {
return false;
}
for (int i = 0; i < Dim; ++i) {
if (size_[i] != rhs.size_[i]) {
return false;
}
if (stride_[i] != rhs.stride_[i]) {
return false;
}
}
return true;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() {
thc_static_assert(sizeof(U) == sizeof(T));
return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ const THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
thc_static_assert(sizeof(U) == sizeof(T));
return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ ptrdiff_t
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
ptrdiff_t size = getSize(0);
for (int i = 1; i < Dim; ++i) {
size *= getSize(i);
}
return size;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguous() const {
return isContiguousRange(0, Dim);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized(int i) const {
if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
return true;
} else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
((getStride(i - 1) / getStride(i)) >= getSize(i))) {
return true;
}
return false;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized() const {
for (int i = 0; i < Dim; ++i) {
if (!isConsistentlySized(i)) {
return false;
}
}
return true;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguousRange(
int first, int last) const {
int64_t prevSize = last < Dim ? getStride(last) * getSize(last) : 1;
for (int i = last - 1; i >= first; --i) {
if (getSize(i) != (IndexT) 1) {
if (getStride(i) == prevSize) {
prevSize *= getSize(i);
} else {
return false;
}
}
}
return true;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::transpose(int dim1,
int dim2) const {
#ifdef __CUDA_ARCH__
// Device code
assert(dim1 >= 0 && dim1 < Dim);
assert(dim2 >= 0 && dim2 < Dim);
#else
// Host code
if (dim1 < 0 || dim1 >= Dim) {
THError("dim1 out of bounds");
}
if (dim2 < 0 || dim2 >= Dim) {
THError("dim2 out of bounds");
}
#endif
IndexT newSize[Dim];
IndexT newStride[Dim];
for (int i = 0; i < Dim; ++i) {
newSize[i] = size_[i];
newStride[i] = stride_[i];
}
IndexT tmp = newSize[dim1];
newSize[dim1] = newSize[dim2];
newSize[dim2] = tmp;
tmp = newStride[dim1];
newStride[dim1] = newStride[dim2];
newStride[dim2] = tmp;
return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastOuter() {
// Can only create tensors of greater dimension
thc_static_assert(NewDim > Dim);
IndexT newSize[NewDim];
IndexT newStride[NewDim];
int shift = NewDim - Dim;
for (int i = 0; i < NewDim; ++i) {
if (i < shift) {
// These are the extended dimensions
newSize[i] = (IndexT) 1;
newStride[i] = size_[0] * stride_[0];
} else {
// Shift the remaining dimensions
newSize[i] = size_[i - shift];
newStride[i] = stride_[i - shift];
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastInner() {
// Can only create tensors of greater dimension
thc_static_assert(NewDim > Dim);
IndexT newSize[NewDim];
IndexT newStride[NewDim];
for (int i = 0; i < NewDim; ++i) {
if (i < Dim) {
// Existing dimensions get copied over
newSize[i] = size_[i];
newStride[i] = stride_[i];
} else {
// Extended dimensions
newSize[i] = (IndexT) 1;
newStride[i] = (IndexT) 1;
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastOuter() {
// Can only create tensors of lesser dimension
thc_static_assert(NewDim < Dim);
// We can't downcast non-contiguous tensors, since it leaves
// garbage data in the tensor. The tensor needs to be contiguous
// in all of the dimensions we are collapsing (no padding in
// them).
bool cont = isContiguousRange(0, Dim - NewDim);
#ifdef __CUDA_ARCH__
// Device code
assert(cont);
#else
// Host code
if (!cont) {
THError("Can only downcast contiguous tensors");
}
#endif
IndexT newSize[NewDim];
IndexT newStride[NewDim];
int ignoredDims = Dim - NewDim;
IndexT collapsedSize = 1;
for (int i = 0; i < Dim; ++i) {
if (i < ignoredDims) {
// Collapse these dimensions
collapsedSize *= getSize(i);
} else {
// Non-collapsed dimensions
if (i == ignoredDims) {
// This is the first non-collapsed dimension
newSize[i - ignoredDims] = collapsedSize * getSize(i);
} else {
// Subsequent non-collapsed dimensions
newSize[i - ignoredDims] = getSize(i);
}
newStride[i - ignoredDims] = getStride(i);
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastInner() {
// Can only create tensors of lesser dimension
thc_static_assert(NewDim < Dim);
// We can't downcast non-contiguous tensors, since it leaves
// garbage data in the tensor. The tensor needs to be contiguous
// in all of the dimensions we are collapsing (no padding in
// them).
bool cont = isContiguousRange(NewDim, Dim);
#ifdef __CUDA_ARCH__
// Device code
assert(cont);
#else
// Host code
if (!cont) {
THError("Can only downcast contiguous tensors");
}
#endif
IndexT newSize[NewDim];
IndexT newStride[NewDim];
IndexT collapsedSize = 1;
for (int i = Dim - 1; i >= 0; --i) {
if (i >= NewDim) {
// Collapse these dimensions
collapsedSize *= getSize(i);
} else {
// Non-collapsed dimensions
if (i == NewDim - 1) {
// This is the first non-collapsed dimension
newSize[i] = collapsedSize * getSize(i);
newStride[i] = getStride(Dim - 1);
} else {
// Subsequent non-collapsed dimensions
newSize[i] = getSize(i);
newStride[i] = getStride(i);
}
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view(DataPtrType at) {
thc_static_assert(SubDim >= 1 && SubDim < Dim);
IndexT viewSizes[SubDim];
IndexT viewStrides[SubDim];
for (int i = 0; i < SubDim; ++i) {
viewSizes[i] = size_[Dim - SubDim + i];
viewStrides[i] = stride_[Dim - SubDim + i];
}
return THCDeviceTensor<T, SubDim, IndexT, PtrTraits>(
at, viewSizes, viewStrides);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view() {
return view<SubDim>(data_);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
void
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::zero(cudaStream_t stream) {
#ifdef __CUDA_ARCH__
assert(isContiguous());
#else
if (!isContiguous()) {
THError("fillAsync only works on contiguous data");
}
#endif
cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream);
}
#ifndef THC_DEVICE_TENSOR_INC
#define THC_DEVICE_TENSOR_INC
#include <cuda.h>
#include <cuda_runtime.h>
// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
template <bool>
struct THCStaticAssert;
template <>
struct THCStaticAssert<true> {
};
#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())
/// Our tensor type
template <typename T,
int Dim,
typename IndexT,
template <typename U> class PtrTraits>
class THCDeviceTensor;
/// Type of a subspace of a tensor
namespace detail {
template <typename TensorType,
int SubDim,
template <typename U> class PtrTraits>
class THCDeviceSubTensor;
}
template <typename T>
struct RestrictPtrTraits {
typedef T* __restrict__ PtrType;
};
template <typename T>
struct DefaultPtrTraits {
typedef T* PtrType;
};
/**
Templated multi-dimensional array that supports strided access of
elements. Main access is through `operator[]`; e.g.,
`tensor[x][y][z]`.
- `T` is the contained type (e.g., `float`)
- `Dim` is the tensor rank
- `IndexT` is the integer type used for size/stride arrays, and for
  all indexing math. Default is `int`, but for large tensors, `int64_t`
  can be used instead.
- `PtrTraits` are traits applied to our data pointer (T*). By default,
  this is just T*, but RestrictPtrTraits can be used to apply T*
  __restrict__ for alias-free analysis.
*/
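///
/// Illustrative usage (a sketch; the device pointer and sizes below are
/// hypothetical):
/// ~~~
/// float* buf;                              // device allocation of 2*3*4 floats
/// int sizes[3] = {2, 3, 4};
/// THCDeviceTensor<float, 3> t(buf, sizes); // strides computed as {12, 4, 1}
/// // t[i][j][k] addresses buf[i * 12 + j * 4 + k]
/// ~~~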
template <typename T,
int Dim,
typename IndexT = int,
template <typename U> class PtrTraits = DefaultPtrTraits>
class THCDeviceTensor {
public:
enum { NumDim = Dim };
typedef T DataType;
typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType;
typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;
/// Default constructor
__host__ __device__ THCDeviceTensor();
/// Constructor that calculates strides with no padding
__host__ __device__ THCDeviceTensor(DataPtrType data,
#ifdef _MSC_VER
const IndexT (&sizes)[Dim]);
#else
const IndexT sizes[Dim]);
#endif
/// Constructor that takes arbitrary size/stride arrays
__host__ __device__ THCDeviceTensor(DataPtrType data,
#ifdef _MSC_VER
const IndexT (&sizes)[Dim],
const IndexT (&strides)[Dim]);
#else
const IndexT sizes[Dim],
const IndexT strides[Dim]);
#endif
/// Returns true if the two tensors are of the same dimensionality,
/// size and stride.
template <int OtherDim>
__host__ __device__ bool
isSameSizeAndStride(
const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;
/// Cast to a tensor of a different type of the same size and stride
template <typename U>
__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();
/// Const version of `cast`
template <typename U>
__host__ __device__
const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;
/// Returns a raw pointer to the start of our data.
__host__ __device__ __forceinline__ DataPtrType data() {
return data_;
}
/// Returns a raw pointer to the start of our data (const).
__host__ __device__ __forceinline__
const DataPtrType data() const {
return data_;
}
/// Cast to a different datatype
template <typename U>
__host__ __device__ __forceinline__
typename PtrTraits<U>::PtrType dataAs() {
return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
}
/// Cast to a different datatype
template <typename U>
__host__ __device__ __forceinline__
const typename PtrTraits<const U>::PtrType dataAs() const {
return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
}
/// Returns a read/write view of a portion of our tensor.
__host__ __device__ __forceinline__
detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
operator[](IndexT);
/// Returns a read/write view of a portion of our tensor (const).
__host__ __device__ __forceinline__
const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
operator[](IndexT) const;
/// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
/// checking.
__host__ __device__ __forceinline__ int getSize(int i) const {
return size_[i];
}
/// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
/// checking.
__host__ __device__ __forceinline__ int getStride(int i) const {
return stride_[i];
}
/// Returns the total number of elements contained within our data
/// (product of `getSize(i)`)
__host__ __device__ ptrdiff_t numElements() const;
/// Returns the size array.
__host__ __device__ __forceinline__ const IndexT* sizes() const {
return size_;
}
/// Returns the stride array.
__host__ __device__ __forceinline__ const IndexT* strides() const {
return stride_;
}
/// Returns true if there is no padding within the tensor and no
/// re-ordering of the dimensions.
/// ~~~
/// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 1
/// ~~~
__host__ __device__ bool isContiguous() const;
/// Returns whether a given dimension has only increasing stride
/// from the previous dimension. A tensor that was permuted by
/// exchanging size and stride only will fail this check.
/// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
__host__ __device__ bool isConsistentlySized(int i) const;
// Returns whether at each dimension `stride <= size`.
// If this is not the case then iterating once over the size space will
// touch the same memory locations multiple times.
__host__ __device__ bool isConsistentlySized() const;
/// Returns true if the given dimension range [first, last) has no padding.
__host__ __device__ bool isContiguousRange(int first, int last) const;
/// Returns a tensor of the same dimension after transposing the two
/// dimensions given. Does not actually move elements; transposition
/// is made by permuting the size/stride arrays.
/// If the dimensions are not valid, asserts.
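/// e.g., a `[2][3]` tensor with strides `{3, 1}` transposes to a `[3][2]`
/// view with strides `{1, 3}` over the same underlying data.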
__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
transpose(int dim1, int dim2) const;
/// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the leading dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
upcastOuter();
/// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the lowest/most varying dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
upcastInner();
/// Downcast a tensor of dimension `D` to some tensor of dimension
/// D' < D by collapsing the leading dimensions. asserts if there is
/// padding on the leading dimensions.
template <int NewDim>
__host__ __device__
THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();
/// Downcast a tensor of dimension `D` to some tensor of dimension
/// D' < D by collapsing the innermost (most rapidly varying) dimensions.
/// asserts if there is padding on the innermost dimensions.
template <int NewDim>
__host__ __device__
THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();
/// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting at `at`.
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
view(DataPtrType at);
/// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting where our data begins
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
view();
/// Zeroes out the tensor asynchronously. Asserts if the contents
/// in question are not contiguous.
void zero(cudaStream_t stream = 0);
private:
/// Raw pointer to where the tensor data begins
DataPtrType data_;
/// Array of strides (in sizeof(T) terms) per each dimension
IndexT stride_[Dim];
/// Size per each dimension
IndexT size_[Dim];
};
namespace detail {
/// Specialization for a view of a single value (0-dimensional)
template <typename TensorType, template <typename U> class PtrTraits>
class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
public:
__host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
operator=(typename TensorType::DataType val) {
*data_ = val;
return *this;
}
// operator T&
__host__ __device__ operator typename TensorType::DataType&() {
return *data_;
}
// const operator T& returning const T&
__host__ __device__ operator const typename TensorType::DataType&() const {
return *data_;
}
// operator& returning T*
__host__ __device__ typename TensorType::DataType* operator&() {
return data_;
}
// const operator& returning const T*
__host__ __device__ const typename TensorType::DataType* operator&() const {
return data_;
}
/// Returns a raw accessor to our slice.
__host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
return data_;
}
/// Returns a raw accessor to our slice (const).
__host__ __device__ __forceinline__
const typename TensorType::DataPtrType data() const {
return data_;
}
/// Cast to a different datatype.
template <typename T>
__host__ __device__ T& as() {
return *dataAs<T>();
}
/// Cast to a different datatype (const).
template <typename T>
__host__ __device__ const T& as() const {
return *dataAs<T>();
}
/// Cast to a different datatype
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<T>::PtrType dataAs() {
return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
}
/// Cast to a different datatype (const)
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<const T>::PtrType dataAs() const {
return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
}
/// Use the texture cache for reads
__device__ __forceinline__ typename TensorType::DataType ldg() const {
#if __CUDA_ARCH__ >= 350
return __ldg(data_);
#else
return *data_;
#endif
}
/// Use the texture cache for reads; cast as a particular type
template <typename T>
__device__ __forceinline__ T ldgAs() const {
#if __CUDA_ARCH__ >= 350
return __ldg(dataAs<T>());
#else
return as<T>();
#endif
}
private:
/// One dimension greater can create us
friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;
/// Our parent tensor can create us
friend class THCDeviceTensor<typename TensorType::DataType,
1,
typename TensorType::IndexType,
PtrTraits>;
__host__ __device__ __forceinline__ THCDeviceSubTensor(
TensorType& t,
typename TensorType::DataPtrType data)
: tensor_(t),
data_(data) {
}
/// The tensor we're referencing
TensorType& tensor_;
/// Where our value is located
typename TensorType::DataPtrType const data_;
};
/// A `SubDim`-rank slice of a parent THCDeviceTensor
template <typename TensorType,
int SubDim,
template <typename U> class PtrTraits>
class THCDeviceSubTensor {
public:
/// Returns a view of the data located at our offset (the dimension
/// `SubDim` - 1 tensor).
__host__ __device__ __forceinline__
THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) {
return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_,
data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
}
/// Returns a view of the data located at our offset (the dimension
/// `SubDim` - 1 tensor) (const).
__host__ __device__ __forceinline__
const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) const {
return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_,
data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
}
// operator& returning T*
__host__ __device__ typename TensorType::DataType* operator&() {
return data_;
}
// const operator& returning const T*
__host__ __device__ const typename TensorType::DataType* operator&() const {
return data_;
}
/// Returns a raw accessor to our slice.
__host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
return data_;
}
/// Returns a raw accessor to our slice (const).
__host__ __device__ __forceinline__
const typename TensorType::DataPtrType data() const {
return data_;
}
/// Cast to a different datatype.
template <typename T>
__host__ __device__ T& as() {
return *dataAs<T>();
}
/// Cast to a different datatype (const).
template <typename T>
__host__ __device__ const T& as() const {
return *dataAs<T>();
}
/// Cast to a different datatype
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<T>::PtrType dataAs() {
return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
}
/// Cast to a different datatype (const)
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<const T>::PtrType dataAs() const {
return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
}
/// Use the texture cache for reads
__device__ __forceinline__ typename TensorType::DataType ldg() const {
#if __CUDA_ARCH__ >= 350
return __ldg(data_);
#else
return *data_;
#endif
}
/// Use the texture cache for reads; cast as a particular type
template <typename T>
__device__ __forceinline__ T ldgAs() const {
#if __CUDA_ARCH__ >= 350
return __ldg(dataAs<T>());
#else
return as<T>();
#endif
}
/// Returns a tensor that is a view of the SubDim-dimensional slice
/// of this tensor, starting where our data begins
THCDeviceTensor<typename TensorType::DataType,
SubDim,
typename TensorType::IndexType,
PtrTraits> view() {
return tensor_.template view<SubDim>(data_);
}
private:
/// One dimension greater can create us
friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;
/// Our parent tensor can create us
friend class
THCDeviceTensor<typename TensorType::DataType,
TensorType::NumDim,
typename TensorType::IndexType,
PtrTraits>;
__host__ __device__ __forceinline__ THCDeviceSubTensor(
TensorType& t,
typename TensorType::DataPtrType data)
: tensor_(t),
data_(data) {
}
/// The tensor we're referencing
TensorType& tensor_;
/// The start of our sub-region
typename TensorType::DataPtrType const data_;
};
} // namespace detail
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
*this, data_)[index]);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
const_cast<TensorType&>(*this), data_)[index]);
}
#include "THCDeviceTensor-inl.cuh"
#endif // THC_DEVICE_TENSOR_INC
namespace detail {
// Add a layer of SFINAE to support static_assert
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct UpcastTHCRoot {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t);
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct UpcastTHC :
UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
};
// Never instantiated, for SFINAE purposes only
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true> {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t) {
thc_static_assert(NewDim > Dim);
return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
template upcastOuter<NewDim>();
}
};
// Add a layer of SFINAE to support static_assert
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct DowncastTHCRoot {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t);
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct DowncastTHC :
DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
};
// Never instantiated, for SFINAE purposes only
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true> {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t) {
thc_static_assert(NewDim < Dim);
return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
template downcastOuter<NewDim>();
}
};
} // namespace detail
#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i) \
case i: \
if (NewDim > i) { \
return detail::UpcastTHC<T, i, IndexT, \
PtrTraits, NewDim, (NewDim > i)>:: \
make(state, t); \
} else if (NewDim == i) { \
return toDeviceTensor<T, NewDim, IndexT, PtrTraits>(state, t); \
} else { \
return detail::DowncastTHC<T, i, IndexT, \
PtrTraits, NewDim, (NewDim < i)>:: \
make(state, t); \
} \
/* break; */
template <typename T, int NewDim,
typename IndexT, template <typename U> class PtrTraits>
THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
toDeviceTensorCast(THCState* state, THCudaTensor* t) {
switch (THCudaTensor_nDimension(state, t)) {
SWITCH_UNROLL_CUDA_CAST_FACTORY(1);
SWITCH_UNROLL_CUDA_CAST_FACTORY(2);
SWITCH_UNROLL_CUDA_CAST_FACTORY(3);
SWITCH_UNROLL_CUDA_CAST_FACTORY(4);
SWITCH_UNROLL_CUDA_CAST_FACTORY(5);
SWITCH_UNROLL_CUDA_CAST_FACTORY(6);
SWITCH_UNROLL_CUDA_CAST_FACTORY(7);
SWITCH_UNROLL_CUDA_CAST_FACTORY(8);
SWITCH_UNROLL_CUDA_CAST_FACTORY(9);
SWITCH_UNROLL_CUDA_CAST_FACTORY(10);
default:
;
}
// Not implemented
THError("THCDeviceTensor dimension size not supported");
return NULL; /* never enters this piece, appeasing compiler warnings */
}
#undef SWITCH_UNROLL_CUDA_CAST_FACTORY
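// Illustrative usage (a sketch; `state` and the contiguous 4-D float tensor
// `t` of shape (N, C, H, W) are hypothetical, and toDeviceTensor is assumed
// to be provided alongside this helper):
//   THCDeviceTensor<float, 2, int, DefaultPtrTraits> flat =
//       toDeviceTensorCast<float, 2, int, DefaultPtrTraits>(state, t);
//   // NewDim (2) < nDimension (4), so the DowncastTHC branch runs and
//   // downcastOuter collapses the first three dimensions, yielding an
//   // (N * C * H, W) view over the same storage.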