"vscode:/vscode.git/clone" did not exist on "aee4f523cfd92f844208118e42dcc6bfeb271d08"
Unverified commit 07f25381, authored by Hang Zhang, committed by GitHub
parents cebf1341 70fdeb79
"""Dilated ResNet and DenseNet""" """Dilated ResNet and DenseNet"""
from .resnet import * from .resnet import *
from .densenet import *
"""Dilated DenseNet"""
from collections import OrderedDict
import torch
import torch.utils.model_zoo as model_zoo
from .. import nn
from .. import functions as F
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
model_urls = {
'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
def densenet121(pretrained=False, **kwargs):
r"""Densenet-121 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet121']))
return model
def densenet169(pretrained=False, **kwargs):
r"""Densenet-169 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet169']))
return model
def densenet201(pretrained=False, **kwargs):
r"""Densenet-201 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet201']))
return model
def densenet161(pretrained=False, **kwargs):
r"""Densenet-161 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet161']))
return model
class _DenseLayer(nn.Sequential):
# pylint: disable=expression-not-assigned
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, dilation=1):
super(_DenseLayer, self).__init__()
self.add_module('norm.1', nn.BatchNorm2d(num_input_features)),
self.add_module('relu.1', nn.ReLU(inplace=True)),
self.add_module('conv.1', nn.Conv2d(
num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)),
self.add_module('relu.2', nn.ReLU(inplace=True)),
self.add_module('conv.2', nn.Conv2d(
bn_size * growth_rate, growth_rate, kernel_size=3, stride=1,
padding=dilation, dilation=dilation, bias=False)),
self.drop_rate = drop_rate
def forward(self, x):
new_features = super(_DenseLayer, self).forward(x)
if self.drop_rate > 0:
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
return torch.cat([x, new_features], 1)
class _DenseBlock(nn.Sequential):
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, dilation=1):
super(_DenseBlock, self).__init__()
for i in range(num_layers):
layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate, dilation=dilation)
self.add_module('denselayer%d' % (i + 1), layer)
class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features, stride, dilation=1):
super(_Transition, self).__init__()
self.add_module('norm', nn.BatchNorm2d(num_input_features))
self.add_module('relu', nn.ReLU(inplace=True))
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
kernel_size=1, stride=1, bias=False))
self.add_module('pool', nn.DilatedAvgPool2d(kernel_size=2, stride=stride,
dilation=dilation))
class DenseNet(nn.Module):
r"""Dilated DenseNet.
To dilate the transition layers of DenseNet correctly, we implement :class:`encoding.nn.DilatedAvgPool2d`.
Args:
growth_rate (int) - how many filters to add each layer (`k` in paper)
block_config (list of 4 ints) - how many layers in each pooling block
num_init_features (int) - the number of filters to learn in the first convolution layer
bn_size (int) - multiplicative factor for the number of bottleneck layers
(i.e. bn_size * k features in the bottleneck layer)
drop_rate (float) - dropout rate after each dense layer
num_classes (int) - number of classification classes
Reference:
Huang, Gao, et al. "Densely Connected Convolutional Networks" *CVPR 2017*
"""
def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
super(DenseNet, self).__init__()
# First convolution
self.features = nn.Sequential(OrderedDict([
('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
('norm0', nn.BatchNorm2d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
]))
# Each denseblock
strides = [1, 2, 1, 1]
dilations = [1, 1, 2, 4]
num_features = num_init_features
for i, num_layers in enumerate(block_config):
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate,
dilation=dilations[i])
self.features.add_module('denseblock%d' % (i + 1), block)
num_features = num_features + num_layers * growth_rate
if i != len(block_config) - 1:
trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2, stride=strides[i+1], dilation=dilations[i])
self.features.add_module('transition%d' % (i + 1), trans)
num_features = num_features // 2
# Final batch norm
self.features.add_module('norm5', nn.BatchNorm2d(num_features))
# Linear layer
self.classifier = nn.Linear(num_features, num_classes)
def forward(self, x):
features = self.features(x)
out = F.relu(features, inplace=True)
# out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1)
# out = self.classifier(out)
return out
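Because the third and fourth dense blocks use dilations 2 and 4 while only the first transition layer downsamples (strides [1, 2, 1, 1]), the features returned by forward stay at 1/8 of the input resolution. A quick sanity-check sketch, assuming the compiled CUDA extension is available and that this module is importable as encoding.dilated (both assumptions, not confirmed by this diff):

```python
import torch
from encoding.dilated import densenet121  # assumed import path for this file

# DilatedAvgPool2d runs through the CUDA kernels, so this check needs a GPU build.
model = densenet121(pretrained=False).cuda().eval()
with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224).cuda())
print(feats.shape)  # expected torch.Size([1, 1024, 28, 28]), i.e. stride 8 for a 224x224 input
```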
@@ -26,15 +26,15 @@ class BasicBlock(nn.Module):
     """ResNet BasicBlock
     """
     expansion = 1
-    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1,
+    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, previous_dilation=1,
                  norm_layer=None):
         super(BasicBlock, self).__init__()
         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                                padding=dilation, dilation=dilation, bias=False)
         self.bn1 = norm_layer(planes)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
-                               padding=first_dilation, dilation=first_dilation, bias=False)
+                               padding=previous_dilation, dilation=previous_dilation, bias=False)
         self.bn2 = norm_layer(planes)
         self.downsample = downsample
         self.stride = stride
@@ -64,7 +64,7 @@ class Bottleneck(nn.Module):
     # pylint: disable=unused-argument
     expansion = 4
     def __init__(self, inplanes, planes, stride=1, dilation=1,
-                 downsample=None, first_dilation=1, norm_layer=None):
+                 downsample=None, previous_dilation=1, norm_layer=None):
         super(Bottleneck, self).__init__()
         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
         self.bn1 = norm_layer(planes)
@@ -75,7 +75,7 @@ class Bottleneck(nn.Module):
         self.conv3 = nn.Conv2d(
             planes, planes * 4, kernel_size=1, bias=False)
         self.bn3 = norm_layer(planes * 4)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.downsample = downsample
         self.dilation = dilation
         self.stride = stride
@@ -113,6 +113,21 @@ class Bottleneck(nn.Module):
 class ResNet(nn.Module):
     """Dilated Pre-trained ResNet Model, which produces stride-8 feature maps at conv5.
+    Parameters
+    ----------
+    block : Block
+        Class for the residual block. Options are BasicBlockV1, BottleneckV1.
+    layers : list of int
+        Numbers of layers in each block.
+    classes : int, default 1000
+        Number of classification classes.
+    dilated : bool, default False
+        Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
+        typically used in Semantic Segmentation.
+    norm_layer : object
+        Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`;
+        e.g. for Synchronized Cross-GPU Batch Normalization).
     Reference:
         - He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
@@ -120,18 +135,26 @@ class ResNet(nn.Module):
         - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
     """
     # pylint: disable=unused-variable
-    def __init__(self, block, layers, num_classes=1000, norm_layer=None):
+    def __init__(self, block, layers, num_classes=1000, dilated=True, norm_layer=nn.BatchNorm2d):
         self.inplanes = 64
         super(ResNet, self).__init__()
         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                                bias=False)
         self.bn1 = norm_layer(64)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
         self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
         self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
-        self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
-        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
+        if dilated:
+            self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
+                                           dilation=2, norm_layer=norm_layer)
+            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
+                                           dilation=4, norm_layer=norm_layer)
+        else:
+            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
+                                           norm_layer=norm_layer)
+            self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
+                                           norm_layer=norm_layer)
         self.avgpool = nn.AvgPool2d(7)
         self.fc = nn.Linear(512 * block.expansion, num_classes)
@@ -155,16 +178,16 @@ class ResNet(nn.Module):
         layers = []
         if dilation == 1 or dilation == 2:
             layers.append(block(self.inplanes, planes, stride, dilation=1,
-                                downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
+                                downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         elif dilation == 4:
             layers.append(block(self.inplanes, planes, stride, dilation=2,
-                                downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
+                                downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         else:
             raise RuntimeError("=> unknown dilation size: {}".format(dilation))
         self.inplanes = planes * block.expansion
         for i in range(1, blocks):
-            layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation,
+            layers.append(block(self.inplanes, planes, dilation=dilation, previous_dilation=dilation,
                                 norm_layer=norm_layer))
         return nn.Sequential(*layers)
...
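The `dilated` flag above is what turns the stride-32 classification backbone into a stride-8 backbone: `layer3` and `layer4` keep `stride=1` and use dilations 2 and 4 instead. A minimal sketch of that effect, assuming the usual resnet50 layer configuration [3, 4, 6, 3] and that `ResNet`/`Bottleneck` can be imported from this module (the import path below is an assumption):

```python
import torch
import torch.nn as nn
from encoding.dilated.resnet import ResNet, Bottleneck  # assumed import path

# Build the backbone with the dilation strategy enabled.
net = ResNet(Bottleneck, [3, 4, 6, 3], dilated=True, norm_layer=nn.BatchNorm2d).eval()
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    x = net.maxpool(net.relu(net.bn1(net.conv1(x))))  # 1/4 resolution
    x = net.layer2(net.layer1(x))                     # 1/8 resolution
    x = net.layer4(net.layer3(x))                     # stays at 1/8 when dilated=True
print(x.shape)  # expected torch.Size([1, 2048, 28, 28]); with dilated=False it would be 7x7
```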
"""Encoding Autograd Fuctions""" """Encoding Autograd Fuctions"""
from .encoding import * from .encoding import *
from .syncbn import * from .syncbn import *
from .customize import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Customized Functions"""
import math
import torch
from torch.autograd import Function, Variable
from torch.nn.modules.utils import _pair
from .._ext import encoding_lib
__all__ = ['dilatedavgpool2d']
class _dilatedavgpool2d(Function):
@staticmethod
def forward(ctx, input, kernel_size, stride, padding,
dilation=1):
ctx.kH, ctx.kW = _pair(kernel_size)
ctx.dH, ctx.dW = _pair(stride if stride is not None else kernel_size)
ctx.padH, ctx.padW = _pair(padding)
ctx.dilationH, ctx.dilationW = _pair(dilation)
b, c, h, w = input.size()
if ctx.dH == 1 and ctx.dW == 1:
# keep the size for dilated avgpool
ow, oh = w, h
else:
ow = math.floor(float(w-ctx.kW+2*ctx.padW)/float(ctx.dW)) +1
oh = math.floor(float(h-ctx.kH+2*ctx.padH)/float(ctx.dH)) +1
with torch.cuda.device_of(input):
output = input.new(b, c, oh, ow)
ctx.save_for_backward(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return output
@staticmethod
def backward(ctx, gradOutput):
input, = ctx.saved_variables
with torch.cuda.device_of(input):
gradInput = Variable(input.data.new().resize_as_(input.data))
if isinstance(input.data, torch.cuda.FloatTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
elif isinstance(input.data, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, None, None, None, None
def dilatedavgpool2d(input, kernel_size, stride=None, padding=0,
dilation=1):
"""Dilated Average Pool 2d, for dilation of DenseNet.
Reference:
Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang,
Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation." CVPR 2018
Applies 2D average-pooling operation in kh x kw regions by step size
dh x dw steps. The number of output features is equal to the number of
input planes.
See :class:`~encoding.nn.DilatedAvgPool2d` for details and output shape.
Args:
input: input tensor (minibatch x in_channels x iH x iW)
kernel_size: size of the pooling region, a single number or a
tuple (kh x kw)
stride: stride of the pooling operation, a single number or a
tuple (sh x sw). Default is equal to kernel size
padding: implicit zero padding on the input, a single number or
a tuple (padh x padw), Default: 0
dilation: the dilation parameter similar to Conv2d
"""
return _dilatedavgpool2d.apply(input, kernel_size, stride, padding, dilation)
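For intuition, here is a small CPU reference (a sketch, not part of the package) that mirrors the arithmetic of the forward kernel: each output pixel averages the valid input samples on a dilated grid, and with stride=1 the spatial size is preserved, which is exactly what the dilated DenseNet transition layers need:

```python
import torch

def dilated_avg_pool2d_ref(x, kernel_size=2, stride=1, padding=0, dilation=1):
    """CPU sketch of the dilated average pooling semantics (square kernels only)."""
    kH = kW = kernel_size
    dH = dW = stride
    b, c, h, w = x.shape
    if dH == 1 and dW == 1:
        oh, ow = h, w                                   # stride 1 keeps the size
    else:
        oh = (h - kH + 2 * padding) // dH + 1
        ow = (w - kW + 2 * padding) // dW + 1
    y = x.new_zeros(b, c, oh, ow)
    for i in range(oh):
        for j in range(ow):
            hs, ws = i * dH - padding, j * dW - padding
            he = min(hs + kH * dilation, h)
            we = min(ws + kW * dilation, w)
            hs, ws = max(hs, 0), max(ws, 0)
            patch = x[:, :, hs:he:dilation, ws:we:dilation]
            y[:, :, i, j] = patch.mean(dim=(-2, -1))    # average only the sampled points
    return y

x = torch.randn(2, 4, 14, 14)
print(dilated_avg_pool2d_ref(x, kernel_size=2, stride=1, dilation=2).shape)  # (2, 4, 14, 14)
```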
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
+## Email: zhanghang0704@gmail.com
+## Copyright (c) 2018
 ##
 ## This source code is licensed under the MIT-style license found in the
 ## LICENSE file in the root directory of this source tree
@@ -11,7 +10,7 @@
 """Functions for Encoding Layer"""
 import torch
 from torch.autograd import Function, Variable
-from .._ext import encoding_lib
+from .. import lib
 __all__ = ['aggregate', 'scaledL2']
@@ -20,47 +19,27 @@ class _aggregate(Function):
     def forward(ctx, A, X, C):
         # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
         ctx.save_for_backward(A, X, C)
-        B, _, K = A.size()
-        D = X.size(2)
-        with torch.cuda.device_of(A):
-            E = A.new(B, K, D)
-        if isinstance(A, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Float_aggregate_forward(E, A, X, C)
-        elif isinstance(A, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Double_aggregate_forward(E, A, X, C)
+        if A.is_cuda:
+            E = lib.gpu.aggregate_forward(A, X, C)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return E
     @staticmethod
     def backward(ctx, gradE):
         A, X, C = ctx.saved_variables
-        with torch.cuda.device_of(A):
-            gradA = Variable(A.data.new().resize_as_(A.data))
-            gradX = Variable(A.data.new().resize_as_(X.data))
-            gradC = Variable(A.data.new().resize_as_(C.data))
-        if isinstance(A.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Float_aggregate_backward(gradA.data, \
-                    gradE.data, A.data, X.data, C.data)
-        elif isinstance(A.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Double_aggregate_backward(gradA.data, \
-                    gradE.data, A.data, X.data, C.data)
+        if A.is_cuda:
+            gradA, gradX, gradC = lib.gpu.aggregate_backward(gradE, A, X, C)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        gradX.data.copy_(torch.bmm(A, gradE).data)
-        gradC.data.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0).data)
+            raise NotImplemented
         return gradA, gradX, gradC
 def aggregate(A, X, C):
-    r"""
-    Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
+    r""" Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
     to the codewords (:math:`C`) with assignment weights (:math:`A`).
     .. math::
         e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - d_k)
     Shape:
@@ -77,53 +56,31 @@ def aggregate(A, X, C):
         >>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
         >>> func = encoding.aggregate()
         >>> E = func(A, X, C)
     """
     return _aggregate.apply(A, X, C)
 class _scaledL2(Function):
     @staticmethod
     def forward(ctx, X, C, S):
-        B, N, _ = X.size()
-        K = C.size(0)
-        with torch.cuda.device_of(X):
-            SL = X.new(B, N, K)
-        if isinstance(X, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
-        elif isinstance(X, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
+        if X.is_cuda:
+            SL = lib.gpu.scaled_l2_forward(X, C, S)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         ctx.save_for_backward(X, C, S, SL)
         return SL
     @staticmethod
     def backward(ctx, gradSL):
         X, C, S, SL = ctx.saved_variables
-        K = C.size(0)
-        with torch.cuda.device_of(X.data):
-            gradX = Variable(X.data.new().resize_as_(X.data))
-            gradC = Variable(X.data.new().resize_as_(C.data))
-            gradS = Variable(X.data.new().resize_as_(S.data))
-        if isinstance(X.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Float_scaledl2_backward(gradSL.data, \
-                    gradX.data, gradC.data, X.data, C.data, S.data)
-        elif isinstance(X.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Double_scaledl2_backward(gradSL.data, \
-                    gradX.data, gradC.data, X.data, C.data, S.data)
+        if X.is_cuda:
+            gradX, gradC, gradS = lib.gpu.scaled_l2_backward(gradSL, X, C, S, SL)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        gradS.data.copy_((gradSL*(SL/S.view(1, 1, K))).sum(0).sum(0).data)
+            raise NotImplemented
         return gradX, gradC, gradS
 def scaledL2(X, C, S):
-    r"""
-    scaledL2 distance
+    r""" scaledL2 distance
     .. math::
         sl_{ik} = s_k \|x_i-c_k\|^2
@@ -134,6 +91,5 @@ def scaledL2(X, C, S):
         (where :math:`B` is batch, :math:`N` is total number of features,
         :math:`K` is number of codewords, :math:`D` is feature dimensions.)
         - Output: :math:`E\in\mathcal{R}^{B\times N\times K}`
     """
     return _scaledL2.apply(X, C, S)
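As a readability aid, both operations admit a short dense-tensor reference in plain PyTorch (a sketch for checking the CUDA path on small inputs, not part of the package). Shapes follow the docstrings: A is (B, N, K), X is (B, N, D), C is (K, D), and S is (K,):

```python
import torch

def aggregate_ref(A, X, C):
    # e_k = sum_i a_ik * (x_i - c_k), computed per batch element
    return torch.einsum('bnk,bnd->bkd', A, X) - torch.einsum('bnk,kd->bkd', A, C)

def scaled_l2_ref(X, C, S):
    # sl_ik = s_k * ||x_i - c_k||^2
    diff = X.unsqueeze(2) - C.view(1, 1, *C.shape)   # (B, N, K, D)
    return S.view(1, 1, -1) * diff.pow(2).sum(-1)

B, N, K, D = 2, 5, 4, 3
A = torch.softmax(torch.randn(B, N, K), dim=-1)
X = torch.randn(B, N, D)
C = torch.randn(K, D)
S = torch.rand(K)
print(aggregate_ref(A, X, C).shape, scaled_l2_ref(X, C, S).shape)  # (2, 4, 3) (2, 5, 4)
```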
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
+## Email: zhanghang0704@gmail.com
+## Copyright (c) 2018
 ##
 ## This source code is licensed under the MIT-style license found in the
 ## LICENSE file in the root directory of this source tree
@@ -11,9 +10,9 @@
 """Synchronized Cross-GPU Batch Normalization functions"""
 import torch
 from torch.autograd import Variable, Function
-from .._ext import encoding_lib
-__all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
+from .. import lib
+__all__ = ['sum_square', 'batchnormtrain']
 def sum_square(input):
     r"""Calculate sum of elements and sum of squares for Batch Normalization"""
@@ -24,91 +23,45 @@ class _sum_square(Function):
     @staticmethod
     def forward(ctx, input):
         ctx.save_for_backward(input)
-        C = input.size(1)
-        with torch.cuda.device_of(input):
-            xsum = input.new().resize_(C).zero_()
-            xsquare = input.new().resize_(C).zero_()
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_sum_square_Forward(
-                    input, xsum, xsquare)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_sum_square_Forward(
-                    input, xsum, xsquare)
+        if input.is_cuda:
+            xsum, xsqusum = lib.gpu.sumsquare_forward(input)
         else:
-            raise RuntimeError('Unimplemented data type!', type(input))
-        return xsum, xsquare
+            raise NotImplemented
+        return xsum, xsqusum
     @staticmethod
     def backward(ctx, gradSum, gradSquare):
         input, = ctx.saved_variables
-        with torch.cuda.device_of(input.data):
-            gradInput = Variable(input.data.new().resize_as_(input.data).zero_())
-        if isinstance(input.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input.data):
-                encoding_lib.Encoding_Float_sum_square_Backward(
-                    gradInput.data, input.data, gradSum.data, gradSquare.data)
-        elif isinstance(input.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input.data):
-                encoding_lib.Encoding_Double_sum_square_Backward(
-                    gradInput.data, input.data, gradSum.data, gradSquare.data)
+        if input.is_cuda:
+            gradInput = lib.gpu.sumsquare_backward(input, gradSum, gradSquare)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return gradInput
-class _batchnorm(Function):
-    def __init__(self, training=False):
-        super(_batchnorm, self).__init__()
-        self.training = training
-    def forward(self, input, gamma, beta, mean, std):
-        self.save_for_backward(input, gamma, beta, mean, std)
-        assert(input.dim() == 3)
-        with torch.cuda.device_of(input):
-            invstd = 1.0 / std
-            output = input.new().resize_as_(input)
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_batchnorm_Forward(output, \
-                    input, mean, invstd, gamma, beta)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_batchnorm_Forward(output, \
-                    input, mean, invstd, gamma, beta)
+class _batchnormtrain(Function):
+    @staticmethod
+    def forward(ctx, input, mean, std, gamma, beta):
+        ctx.save_for_backward(input, mean, std, gamma, beta)
+        if input.is_cuda:
+            output = lib.gpu.batchnorm_forward(input, mean, std, gamma, beta)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return output
-    def backward(self, gradOutput):
-        input, gamma, beta, mean, std = self.saved_tensors
-        invstd = 1.0 / std
-        with torch.cuda.device_of(input):
-            gradInput = gradOutput.new().resize_as_(input).zero_()
-            gradGamma = gradOutput.new().resize_as_(gamma).zero_()
-            gradBeta = gradOutput.new().resize_as_(beta).zero_()
-            gradMean = gradOutput.new().resize_as_(mean).zero_()
-            gradStd = gradOutput.new().resize_as_(std).zero_()
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta,
-                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    self.training)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta,
-                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    self.training)
+    @staticmethod
+    def backward(ctx, gradOutput):
+        input, mean, std, gamma, beta = ctx.saved_variables
+        if gradOutput.is_cuda:
+            gradInput, gradMean, gradStd, gradGamma, gradBeta = \
+                lib.gpu.batchnorm_backward(gradOutput, input, mean,
+                                           std, gamma, beta, True)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        return gradInput, gradGamma, gradBeta, gradMean, gradStd
+            raise NotImplemented
+        return gradInput, gradMean, gradStd, gradGamma, gradBeta
-def batchnormtrain(input, gamma, beta, mean, std):
+def batchnormtrain(input, mean, std, gamma, beta):
     r"""Applies Batch Normalization over a 3d input that is seen as a
     mini-batch.
@@ -123,14 +76,4 @@ def batchnormtrain(input, gamma, beta, mean, std):
     - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
     """
-    return _batchnorm(True)(input, gamma, beta, mean, std)
+    return _batchnormtrain.apply(input, mean, std, gamma, beta)
-def batchnormeval(input, gamma, beta, mean, std):
-    r"""Applies Batch Normalization over a 3d input that is seen as a
-    mini-batch.
-    Please see encoding.batchnormtrain_
-    """
-    return _batchnorm(False)(input, gamma, beta, mean, std)
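For reference, batchnormtrain computes the usual affine batch-norm transform with externally supplied per-channel statistics; the statistics are what sum_square lets the caller reduce across GPUs first. A plain-PyTorch sketch of the same arithmetic (an illustration, not the package API):

```python
import torch

def batchnormtrain_ref(x, mean, std, gamma, beta):
    # x: (B, C, L) flattened input; mean/std/gamma/beta: (C,)
    shape = (1, -1, 1)
    return gamma.view(shape) * (x - mean.view(shape)) / std.view(shape) + beta.view(shape)

x = torch.randn(4, 8, 32)
xsum, xsqu = x.sum(dim=(0, 2)), x.pow(2).sum(dim=(0, 2))   # what sum_square returns
n = x.numel() / x.size(1)
mean = xsum / n
std = (xsqu / n - mean.pow(2)).clamp(min=1e-5).sqrt()
y = batchnormtrain_ref(x, mean, std, torch.ones(8), torch.zeros(8))
print(round(y.mean().item(), 3), round(y.std().item(), 3))  # roughly 0 and 1
```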
// Warp size and the maximum number of threads in a block
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 512;
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
static int getNumThreads(int nElem) {
int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
for (int i = 0; i != 5; ++i) {
if (nElem <= threadSizes[i]) {
return threadSizes[i];
}
}
return MAX_BLOCK_SIZE;
}
__device__ __forceinline__ int getMSB(int val) {
return 31 - __clz(val);
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/device_tensor.h"
#else
template <int Dim>
THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
if (!t) {
return THCDeviceTensor<real, Dim>();
}
int inDim = THCTensor_(nDimension)(state, t);
return toDeviceTensor<real, Dim>(state, t);
/*
if (inDim == Dim) {
return toDeviceTensor<real, Dim>(state, t);
}
// View in which the last dimensions are collapsed or expanded as needed
THAssert(THCTensor_(isContiguous)(state, t));
int size[Dim];
for (int i = 0; i < Dim || i < inDim; ++i) {
if (i < Dim && i < inDim) {
size[i] = t->size[i];
} else if (i < Dim) {
size[i] = 1;
} else {
size[Dim - 1] *= t->size[i];
}
}
return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
*/
}
struct Encoding_(Float2)
/*
* For reduce-sum calculation of two elements
*/
{
real v1, v2;
__device__ Encoding_(Float2)() {}
__device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
__device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a)
{
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if CUDA_VERSION >= 9000
unsigned int mask = 0xffffffff;
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor_sync(mask, val, 1 << i, WARP_SIZE);
}
#else
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor(val, 1 << i, WARP_SIZE);
}
#else
__shared__ real values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
#endif
return val;
}
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(
Encoding_(Float2) value)
{
value.v1 = Encoding_(rwarpSum)(value.v1);
value.v2 = Encoding_(rwarpSum)(value.v2);
return value;
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.c"
#else
__global__ void Encoding_(Aggregate_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
Encoding_(AggOp) g(A,X,C);
E[b][k][d] = Encoding_(reduce_agg)(g,b,k,d,N);
}
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregating forward the residuals with assignment weights
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
if (THCTensor_(nDimension)(state, E_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// B, K, D
dim3 blocks(C.getSize(1), C.getSize(0), X.getSize(0));
// N
dim3 threads(getNumThreads(X.getSize(1)));
Encoding_(Aggregate_Forward_kernel)<<<blocks, threads, 0, stream>>>
(E, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Backward_kernel) (
THCDeviceTensor<real, 3> GA,
THCDeviceTensor<real, 3> GE,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating backward kernel function
* G (dl/dR), L (dl/dE), A
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
Encoding_(AggBackOp) g(GE,X,C);
GA[b][i][k] = Encoding_(reduce_aggback)(g,b,i,k,D);
}
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregate backward to assignment weights
* G (dl/dR), L (dl/dE), A
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 5, GA_, GE_, A_, X_, C_);
if (THCTensor_(nDimension)(state, GA_) != 3 ||
THCTensor_(nDimension)(state, GE_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// B, K, D
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
// N
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GA, GE, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2_Forward_kernel) (
THCDeviceTensor<real, 3> SL,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
* scaled-L2 forward kernel function
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
Encoding_(L2Op) g(X,C);
SL[b][i][k] = S[k] * Encoding_(reduce_sl2)(g,b,i,k,D);
}
void Encoding_(ScaledL2_Forward)(
THCState *state, THCTensor *SL_, THCTensor *X_,
THCTensor *C_, THCTensor *S_)
/*
* scaled-L2 forward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, SL_, X_, C_, S_);
if (THCTensor_(nDimension)(state, SL_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> SL = devicetensor<3>(state, SL_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(ScaledL2_Forward_kernel)<<<blocks, threads, 0, stream>>>
(SL, X, C, S);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2X_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 3> GX,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
i = blockIdx.y;
K = C.getSize(0);
/* main operation */
Encoding_(L2XBackOp) g(GSL,X,C,S);
GX[b][i][d] = Encoding_(reduce_sl2xback)(g,b,i,d,K);
}
__global__ void Encoding_(ScaledL2C_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 2> GC,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
Encoding_(L2CBackOp) g(GSL,X,C,S);
GC[k][d] = Encoding_(reduce_sl2cback)(g,k,d,B,N);
}
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, GSL_, GX_, GC_, X_, C_, S_);
if (THCTensor_(nDimension)(state, GSL_) != 3 ||
THCTensor_(nDimension)(state, GX_) != 3 ||
THCTensor_(nDimension)(state, GC_) != 2 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GSL = devicetensor<3>(state, GSL_);
THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
THCDeviceTensor<real, 2> GC = devicetensor<2>(state, GC_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(X.getSize(2), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(0)));
Encoding_(ScaledL2X_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GSL, GX, X, C, S);
THCudaCheck(cudaGetLastError());
dim3 blocks2(C.getSize(1), C.getSize(0));
dim3 threads2(getNumThreads(X.getSize(1)));
Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads2, 0, stream>>>
(GSL, GC, X, C, S);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.h"
#else
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(ScaledL2_Forward)( THCState *state, THCTensor *SL_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_utils.c"
#else
struct Encoding_(AggOp) {
__device__ Encoding_(AggOp)(THCDeviceTensor<real, 3> a,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: A(a), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> A;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_agg)(
Encoding_(AggOp) op,
int b, int k, int d, int N)
{
real sum = 0;
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(b,x,k,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(AggBackOp) {
__device__ Encoding_(AggBackOp)(THCDeviceTensor<real, 3> ge,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: GE(ge), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GE;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_aggback)(
Encoding_(AggBackOp) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2Op) {
__device__ Encoding_(L2Op)(THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
real r = X[b][i][d].ldg() - C[k][d].ldg();
return r * r;
}
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_sl2)(
Encoding_(L2Op) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2XBackOp) {
__device__ Encoding_(L2XBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return 2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2xback)(
Encoding_(L2XBackOp) op,
int b, int i, int d, int K)
{
real sum = 0;
for (int x = threadIdx.x; x < K; x += blockDim.x) {
sum += op(b,i,x,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2CBackOp) {
__device__ Encoding_(L2CBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return -2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2cback)(
Encoding_(L2CBackOp) op,
int k, int d, int B, int N)
{
real sum = 0;
for (int batch = 0; batch < B; ++batch) {
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(batch,x,k,d);
}
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.c"
#else
__global__ void Encoding_(DilatedAvgPool_Forward_kernel) (
THCDeviceTensor<real, 4> X,
THCDeviceTensor<real, 4> Y,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
* dilated avgpool2d forward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = Y.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= Y.getSize(3) || h >= Y.getSize(2)) return;
int hstart = h*dH -padH;
int wstart = w*dW -padW;
int hend = min(hstart + kH*dilationH, X.getSize(2));
int wend = min(wstart + kW*dilationW, X.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum = 0;
for (int th=hstart; th < hend; th+=dilationH) {
for (int tw=wstart; tw < wend; tw+=dilationW) {
sum += X[b][c][th][tw];
}
}
Y[b][c][h][w] = sum / pool_size;
}
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
* dilated avgpool2d forward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, X_, Y_);
if (THCTensor_(nDimension)(state, X_) != 4 ||
THCTensor_(nDimension)(state, Y_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> X = devicetensor<4>(state, X_);
THCDeviceTensor<real, 4> Y = devicetensor<4>(state, Y_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(Y.getSize(3)/16+1, Y.getSize(2)/16+1,
Y.getSize(1)*Y.getSize(0));
Encoding_(DilatedAvgPool_Forward_kernel)<<<blocks, threads, 0, stream>>>
(X, Y, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(DilatedAvgPool_Backward_kernel) (
THCDeviceTensor<real, 4> gradX,
THCDeviceTensor<real, 4> gradY,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
* dilated avgpool2d backward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = gradX.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= gradX.getSize(3) || h >= gradX.getSize(2)) return;
int phstart = (h + padH < ((kH-1)*dilationH+1)) ? 0 :
(h + padH - ((kH-1)*dilationH+1))/dH + 1;
int pwstart = (w + padW < ((kW-1)*dilationW+1)) ? 0 :
(w + padW - ((kW-1)*dilationW+1))/dW + 1;
int phend = min((h+padH)/dH+1, gradY.getSize(2));
int pwend = min((w+padW)/dW+1, gradY.getSize(3));
sum = 0;
int hstart, wstart, hend, wend, pool_size;
for (int ph=phstart; ph < phend; ++ph) {
for (int pw=pwstart; pw < pwend; ++pw) {
hstart = ph*dH -padH;
wstart = pw*dW -padW;
hend = min(hstart + kH*dilationH, gradX.getSize(2));
wend = min(wstart + kW*dilationW, gradX.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum += gradY[b][c][ph][pw] / pool_size;
}
}
gradX[b][c][h][w] = sum;
}
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
* dilated avgpool2d backward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, gradX_, gradY_);
if (THCTensor_(nDimension)(state, gradX_) != 4 ||
THCTensor_(nDimension)(state, gradY_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> gradX = devicetensor<4>(state, gradX_);
THCDeviceTensor<real, 4> gradY = devicetensor<4>(state, gradY_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(gradX.getSize(3)/16+1, gradX.getSize(2)/16+1,
gradX.getSize(1)*gradX.getSize(0));
Encoding_(DilatedAvgPool_Backward_kernel)<<<blocks, threads, 0, stream>>>
(gradX, gradY, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.h"
#else
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.c"
#else
__global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 3> output,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta)
{
int c = blockIdx.x;
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
real inp = input[b][c][x].ldg();
output[b][c][x] = gamma[c].ldg() * (inp - mean[c].ldg()) *
invstd[c].ldg() + beta[c].ldg();
}
}
}
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
/*
* batch norm forward function
* assuming the input is already flattened
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, output_, input_, mean_, invstd_,
gamma_, beta_);
if (THCTensor_(nDimension)(state, output_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1)
THError("BatchNorm2d forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> output = devicetensor<3>(state, output_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Forward_kernel)<<<blocks, threads, 0, stream>>>(
output, input, mean, invstd, gamma, beta);
THCudaCheck(cudaGetLastError());
}
struct Encoding_(GradOp) {
__device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
: mean(m), input(i), gradOutput(g) {}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = gradOutput[batch][plane][n].ldg();
real c = input[batch][plane][n].ldg() - mean;
return Encoding_(Float2)(g, g * c);
}
real mean;
THCDeviceTensor<real, 3> input;
THCDeviceTensor<real, 3> gradOutput;
};
// Sum across (batch, b/c/n) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce)(
Encoding_(GradOp) op,
THCDeviceTensor<real, 3> tensor,
int plane)
{
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
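/* A minimal sketch of the warp-level summation that Encoding_(warpSum) is
 * assumed to perform before the shared-memory stage above: each lane adds the
 * value held by the lane `offset` positions higher via register shuffles, so
 * lane 0 ends up with the warp total. Illustrative only, not the library's
 * actual implementation. */
__device__ inline float warp_sum_sketch(float val) {
	for (int offset = warpSize / 2; offset > 0; offset /= 2) {
		val += __shfl_down_sync(0xffffffff, val, offset); /* pull from lane + offset */
	}
	return val; /* complete in lane 0 of each warp */
}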
__global__ void Encoding_(BatchNorm_Backward_kernel) (
THCDeviceTensor<real, 3> gradoutput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 3> gradinput,
THCDeviceTensor<real, 1> gradgamma,
THCDeviceTensor<real, 1> gradbeta,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta,
THCDeviceTensor<real, 1> gradMean,
THCDeviceTensor<real, 1> gradStd,
int train)
{
/* declarations of the variables */
/* Get the index and channels */
int c = blockIdx.x;
/* main operation */
Encoding_(GradOp) g(mean[c], input, gradoutput);
Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
real gradOutputSum = res.v1;
real dotP = res.v2;
//real projScale = dotP * norm * invstd[c].ldg() * invstd[c].ldg();
real gradScale = invstd[c].ldg() * gamma[c].ldg();
if (train && threadIdx.x == 0) {
gradMean[c] = - gradOutputSum * gamma[c].ldg() * invstd[c].ldg();
gradStd[c] = - dotP * gamma[c].ldg() * invstd[c].ldg() * invstd[c].ldg();
}
if (gradinput.numElements() > 0) {
for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
gradinput[batch][c][x] = gradoutput[batch][c][x].ldg() * gradScale;
}
}
}
if (gradgamma.numElements() > 0) {
if (threadIdx.x == 0) {
gradgamma[c] += dotP * invstd[c].ldg();
}
}
if (gradbeta.numElements() > 0) {
if (threadIdx.x == 0) {
gradbeta[c] += gradOutputSum;
}
}
}
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
* batch norm backward function
 * assuming the input has already been flattened
*/
{
/* Check the GPU index and tensor dims*/
	THCTensor_(checkGPU)(state, 9, gradoutput_, input_, gradinput_,
		gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_);
if (THCTensor_(nDimension)(state, gradoutput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradinput_) != 3 ||
THCTensor_(nDimension)(state, gradgamma_) != 1 ||
THCTensor_(nDimension)(state, gradbeta_) != 1 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1 ||
THCTensor_(nDimension)(state, gradMean_) != 1 ||
THCTensor_(nDimension)(state, gradStd_) != 1 )
THError("BatchNorm2d backward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradoutput =
devicetensor<3>(state, gradoutput_);
THCDeviceTensor<real, 3> input =
devicetensor<3>(state, input_);
THCDeviceTensor<real, 3> gradinput =
devicetensor<3>(state, gradinput_);
THCDeviceTensor<real, 1> gradgamma =
devicetensor<1>(state, gradgamma_);
THCDeviceTensor<real, 1> gradbeta = devicetensor<1>(state, gradbeta_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
THCDeviceTensor<real, 1> gradMean = devicetensor<1>(state, gradMean_);
THCDeviceTensor<real, 1> gradStd = devicetensor<1>(state, gradStd_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradoutput, input, gradinput, gradgamma, gradbeta, mean, invstd,
gamma, beta, gradMean, gradStd, train);
THCudaCheck(cudaGetLastError());
}
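/* Illustrative single-channel C++ reference (hypothetical helper, not in the
 * original sources) of the gradients produced by BatchNorm_Backward_kernel:
 * gradInput scales gradOutput by gamma*invstd, gradGamma/gradBeta accumulate
 * the channel reductions, and gradMean/gradStd are only filled in training. */
static void batchnorm_backward_reference_channel(
	const float* x, const float* gout, float* gin, /* flattened view of one channel */
	int count, float mean, float invstd, float gamma,
	float* gradGamma, float* gradBeta, float* gradMean, float* gradStd, int train)
{
	float gsum = 0.f, dotP = 0.f;
	for (int i = 0; i < count; ++i) {
		gsum += gout[i];                   /* sum of gradOutput */
		dotP += (x[i] - mean) * gout[i];   /* <x - mean, gradOutput> */
	}
	for (int i = 0; i < count; ++i)
		gin[i] = gout[i] * gamma * invstd; /* gradInput */
	*gradGamma += dotP * invstd;
	*gradBeta  += gsum;
	if (train) {
		*gradMean = -gsum * gamma * invstd;
		*gradStd  = -dotP * gamma * invstd * invstd;
	}
}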
struct Encoding_(SumOp) {
__device__ Encoding_(SumOp)(THCDeviceTensor<real, 3> i)
: input(i){}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = input[batch][plane][n].ldg();
return Encoding_(Float2)(g, g * g);
}
real mean;
THCDeviceTensor<real, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce_sum)(Encoding_(SumOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
__global__ void Encoding_(Sum_Square_Forward_kernel) (
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> sum,
THCDeviceTensor<real, 1> square)
{
int c = blockIdx.x;
/* main operation */
Encoding_(SumOp) g(input);
Encoding_(Float2) res = Encoding_(reduce_sum)(g, input, c);
real xsum = res.v1;
real xsquare = res.v2;
if (threadIdx.x == 0) {
sum[c] = xsum;
square[c] = xsquare;
}
}
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, input_, sum_, square_);
if (THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, sum_) != 1 ||
THCTensor_(nDimension)(state, square_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> sum = devicetensor<1>(state, sum_);
THCDeviceTensor<real, 1> square = devicetensor<1>(state, square_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Forward_kernel)<<<blocks, threads, 0, stream>>>(
input, sum, square);
THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(Sum_Square_Backward_kernel) (
THCDeviceTensor<real, 3> gradInput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> gradSum,
THCDeviceTensor<real, 1> gradSquare)
{
int c = blockIdx.x;
/* main operation */
for (int batch = 0; batch < gradInput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradInput.getSize(2); x += blockDim.x)
{
gradInput[batch][c][x] = gradSum[c] + 2 * gradSquare[c] *
input[batch][c][x];
}
}
}
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput_, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, gradInput_, input_, gradSum_,
gradSquare_);
if (THCTensor_(nDimension)(state, gradInput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradSum_) != 1 ||
THCTensor_(nDimension)(state, gradSquare_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradInput = devicetensor<3>(state, gradInput_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> gradSum = devicetensor<1>(state, gradSum_);
	THCDeviceTensor<real, 1> gradSquare = devicetensor<1>(state, gradSquare_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradInput, input, gradSum, gradSquare);
THCudaCheck(cudaGetLastError());
}
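/* Illustrative note: the backward kernel above is the chain rule for the two
 * reductions computed in the forward pass, d(sum x)/dx = 1 and
 * d(sum x^2)/dx = 2x, hence gradInput = gradSum + 2 * gradSquare * x
 * for every element x of the input. */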
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.h"
#else
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_);
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train);
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_);
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_);
#endif
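/* A hedged sketch (not from the original sources) of how the entry points
 * above are presumably combined for synchronized batch normalization: the
 * per-channel sum and square-sum from Sum_Square_Forward are reduced across
 * devices elsewhere, then turned into the mean/invstd consumed by
 * BatchNorm_Forward. Function and variable names here are illustrative. */
#include <math.h>
static void moments_from_sums(const float* sum, const float* square,
                              float* mean, float* invstd,
                              int channels, long count, float eps)
{
	for (int c = 0; c < channels; ++c) {
		mean[c] = sum[c] / count;                          /* E[x] */
		float var = square[c] / count - mean[c] * mean[c]; /* E[x^2] - E[x]^2 */
		invstd[c] = 1.f / sqrtf(var + eps);                /* 1 / sqrt(var + eps) */
	}
}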
Copied from the PyTorch lib to make compilation easier for users, in response to many questions and requests.
#include <assert.h>
namespace detail {
template <typename T, int N>
__host__ __device__ void copy(T to[N], T from[N]) {
for (int i = 0; i < N; ++i) {
to[i] = from[i];
}
}
} // namespace detail
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor()
: data_(NULL) {
thc_static_assert(Dim > 0);
for (int i = 0; i < Dim; ++i) {
size_[i] = 0;
stride_[i] = (IndexT) 1;
}
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::
#ifdef _MSC_VER
THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim])
#else
THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim])
#endif
: data_(data) {
thc_static_assert(Dim > 0);
for (int i = 0; i < Dim; ++i) {
size_[i] = sizes[i];
}
stride_[Dim - 1] = (IndexT) 1;
for (int i = Dim - 2; i >= 0; --i) {
stride_[i] = stride_[i + 1] * sizes[i + 1];
}
}
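/* Worked example (illustrative): for sizes {2, 3, 4} the loop above produces
 * the row-major strides {12, 4, 1}, i.e. stride[i] = stride[i+1] * size[i+1]
 * with the innermost stride fixed at 1, e.g.
 *   int sizes[3] = {2, 3, 4};
 *   THCDeviceTensor<float, 3> t(devPtr, sizes);  // strides become {12, 4, 1}
 * where devPtr is a hypothetical device pointer. */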
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor(
#ifdef _MSC_VER
DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim])
#else
DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
#endif
: data_(data) {
thc_static_assert(Dim > 0);
for (int i = 0; i < Dim; ++i) {
size_[i] = sizes[i];
stride_[i] = strides[i];
}
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int OtherDim>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isSameSizeAndStride(
const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const {
if (Dim != OtherDim) {
return false;
}
for (int i = 0; i < Dim; ++i) {
if (size_[i] != rhs.size_[i]) {
return false;
}
if (stride_[i] != rhs.stride_[i]) {
return false;
}
}
return true;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() {
thc_static_assert(sizeof(U) == sizeof(T));
return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ const THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
thc_static_assert(sizeof(U) == sizeof(T));
return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
reinterpret_cast<U*>(data_), size_, stride_);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ ptrdiff_t
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
ptrdiff_t size = getSize(0);
for (int i = 1; i < Dim; ++i) {
size *= getSize(i);
}
return size;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguous() const {
return isContiguousRange(0, Dim);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized(int i) const {
if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
return true;
} else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
((getStride(i - 1) / getStride(i)) >= getSize(i))) {
return true;
}
return false;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized() const {
for (int i = 0; i < Dim; ++i) {
if (!isConsistentlySized(i)) {
return false;
}
}
return true;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguousRange(
int first, int last) const {
int64_t prevSize = last < Dim ? getStride(last) * getSize(last) : 1;
for (int i = last - 1; i >= first; --i) {
if (getSize(i) != (IndexT) 1) {
if (getStride(i) == prevSize) {
prevSize *= getSize(i);
} else {
return false;
}
}
}
return true;
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::transpose(int dim1,
int dim2) const {
#ifdef __CUDA_ARCH__
// Device code
assert(dim1 >= 0 && dim1 < Dim);
  assert(dim2 >= 0 && dim2 < Dim);
#else
// Host code
if (dim1 < 0 || dim1 >= Dim) {
THError("dim1 out of bounds");
}
if (dim2 < 0 || dim2 >= Dim) {
THError("dim2 out of bounds");
}
#endif
IndexT newSize[Dim];
IndexT newStride[Dim];
for (int i = 0; i < Dim; ++i) {
newSize[i] = size_[i];
newStride[i] = stride_[i];
}
IndexT tmp = newSize[dim1];
newSize[dim1] = newSize[dim2];
newSize[dim2] = tmp;
tmp = newStride[dim1];
newStride[dim1] = newStride[dim2];
newStride[dim2] = tmp;
return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(data_, newSize, newStride);
}
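/* Worked example (illustrative): transposing a [2][3] tensor with strides
 * {3, 1} yields sizes {3, 2} and strides {1, 3}; no elements are moved, only
 * the size/stride metadata is swapped. */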
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastOuter() {
// Can only create tensors of greater dimension
thc_static_assert(NewDim > Dim);
IndexT newSize[NewDim];
IndexT newStride[NewDim];
int shift = NewDim - Dim;
for (int i = 0; i < NewDim; ++i) {
if (i < shift) {
// These are the extended dimensions
newSize[i] = (IndexT) 1;
newStride[i] = size_[0] * stride_[0];
} else {
// Shift the remaining dimensions
newSize[i] = size_[i - shift];
newStride[i] = stride_[i - shift];
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastInner() {
// Can only create tensors of greater dimension
thc_static_assert(NewDim > Dim);
IndexT newSize[NewDim];
IndexT newStride[NewDim];
for (int i = 0; i < NewDim; ++i) {
if (i < Dim) {
// Existing dimensions get copied over
newSize[i] = size_[i];
newStride[i] = stride_[i];
} else {
// Extended dimensions
newSize[i] = (IndexT) 1;
newStride[i] = (IndexT) 1;
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastOuter() {
// Can only create tensors of lesser dimension
thc_static_assert(NewDim < Dim);
// We can't downcast non-contiguous tensors, since it leaves
// garbage data in the tensor. The tensor needs to be contiguous
// in all of the dimensions we are collapsing (no padding in
// them).
bool cont = isContiguousRange(0, Dim - NewDim);
#ifdef __CUDA_ARCH__
// Device code
assert(cont);
#else
// Host code
if (!cont) {
THError("Can only downcast contiguous tensors");
}
#endif
IndexT newSize[NewDim];
IndexT newStride[NewDim];
int ignoredDims = Dim - NewDim;
IndexT collapsedSize = 1;
for (int i = 0; i < Dim; ++i) {
if (i < ignoredDims) {
// Collapse these dimensions
collapsedSize *= getSize(i);
} else {
// Non-collapsed dimensions
if (i == ignoredDims) {
// This is the first non-collapsed dimension
newSize[i - ignoredDims] = collapsedSize * getSize(i);
} else {
// Subsequent non-collapsed dimensions
newSize[i - ignoredDims] = getSize(i);
}
newStride[i - ignoredDims] = getStride(i);
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
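/* Worked example (illustrative): downcasting a contiguous [2][3][4] tensor to
 * 2-D collapses the two leading dimensions, giving a [6][4] view with strides
 * {4, 1}; the data pointer is unchanged. */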
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastInner() {
// Can only create tensors of lesser dimension
thc_static_assert(NewDim < Dim);
// We can't downcast non-contiguous tensors, since it leaves
// garbage data in the tensor. The tensor needs to be contiguous
// in all of the dimensions we are collapsing (no padding in
// them).
bool cont = isContiguousRange(NewDim, Dim);
#ifdef __CUDA_ARCH__
// Device code
assert(cont);
#else
// Host code
if (!cont) {
THError("Can only downcast contiguous tensors");
}
#endif
IndexT newSize[NewDim];
IndexT newStride[NewDim];
IndexT collapsedSize = 1;
for (int i = Dim - 1; i >= 0; --i) {
if (i >= NewDim) {
// Collapse these dimensions
collapsedSize *= getSize(i);
} else {
// Non-collapsed dimensions
if (i == NewDim - 1) {
// This is the first non-collapsed dimension
newSize[i] = collapsedSize * getSize(i);
newStride[i] = getStride(Dim - 1);
} else {
// Subsequent non-collapsed dimensions
newSize[i] = getSize(i);
newStride[i] = getStride(i);
}
}
}
return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
data_, newSize, newStride);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view(DataPtrType at) {
thc_static_assert(SubDim >= 1 && SubDim < Dim);
IndexT viewSizes[SubDim];
IndexT viewStrides[SubDim];
for (int i = 0; i < SubDim; ++i) {
viewSizes[i] = size_[Dim - SubDim + i];
viewStrides[i] = stride_[Dim - SubDim + i];
}
return THCDeviceTensor<T, SubDim, IndexT, PtrTraits>(
at, viewSizes, viewStrides);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view() {
return view<SubDim>(data_);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
void
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::zero(cudaStream_t stream) {
#ifdef __CUDA_ARCH__
assert(isContiguous());
#else
if (!isContiguous()) {
THError("fillAsync only works on contiguous data");
}
#endif
cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream);
}
#ifndef THC_DEVICE_TENSOR_INC
#define THC_DEVICE_TENSOR_INC
#include <cuda.h>
#include <cuda_runtime.h>
// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
template <bool>
struct THCStaticAssert;
template <>
struct THCStaticAssert<true> {
};
#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())
/// Our tensor type
template <typename T,
int Dim,
typename IndexT,
template <typename U> class PtrTraits>
class THCDeviceTensor;
/// Type of a subspace of a tensor
namespace detail {
template <typename TensorType,
int SubDim,
template <typename U> class PtrTraits>
class THCDeviceSubTensor;
}
template <typename T>
struct RestrictPtrTraits {
typedef T* __restrict__ PtrType;
};
template <typename T>
struct DefaultPtrTraits {
typedef T* PtrType;
};
/**
Templated multi-dimensional array that supports strided access of
elements. Main access is through `operator[]`; e.g.,
`tensor[x][y][z]`.
- `T` is the contained type (e.g., `float`)
- `Dim` is the tensor rank
- `IndexT` is the integer type used for size/stride arrays, and for
  all indexing math. Default is `int`, but for large tensors, `int64_t`
  can be used instead.
- `PtrTraits` are traits applied to our data pointer (T*). By default,
  this is just T*, but RestrictPtrTraits can be used to apply T*
  __restrict__ for alias-free analysis.
*/
template <typename T,
int Dim,
typename IndexT = int,
template <typename U> class PtrTraits = DefaultPtrTraits>
class THCDeviceTensor {
public:
enum { NumDim = Dim };
typedef T DataType;
typedef IndexT IndexType;
typedef typename PtrTraits<T>::PtrType DataPtrType;
typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;
/// Default constructor
__host__ __device__ THCDeviceTensor();
/// Constructor that calculates strides with no padding
__host__ __device__ THCDeviceTensor(DataPtrType data,
#ifdef _MSC_VER
const IndexT (&sizes)[Dim]);
#else
const IndexT sizes[Dim]);
#endif
/// Constructor that takes arbitrary size/stride arrays
__host__ __device__ THCDeviceTensor(DataPtrType data,
#ifdef _MSC_VER
const IndexT (&sizes)[Dim],
const IndexT (&strides)[Dim]);
#else
const IndexT sizes[Dim],
const IndexT strides[Dim]);
#endif
/// Returns true if the two tensors are of the same dimensionality,
/// size and stride.
template <int OtherDim>
__host__ __device__ bool
isSameSizeAndStride(
const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;
/// Cast to a tensor of a different type of the same size and stride
template <typename U>
__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();
/// Const version of `cast`
template <typename U>
__host__ __device__
const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;
/// Returns a raw pointer to the start of our data.
__host__ __device__ __forceinline__ DataPtrType data() {
return data_;
}
/// Returns a raw pointer to the start of our data (const).
__host__ __device__ __forceinline__
const DataPtrType data() const {
return data_;
}
/// Cast to a different datatype
template <typename U>
__host__ __device__ __forceinline__
typename PtrTraits<U>::PtrType dataAs() {
return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
}
/// Cast to a different datatype
template <typename U>
__host__ __device__ __forceinline__
const typename PtrTraits<const U>::PtrType dataAs() const {
return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
}
/// Returns a read/write view of a portion of our tensor.
__host__ __device__ __forceinline__
detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
operator[](IndexT);
/// Returns a read/write view of a portion of our tensor (const).
__host__ __device__ __forceinline__
const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
operator[](IndexT) const;
/// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
/// checking.
__host__ __device__ __forceinline__ int getSize(int i) const {
return size_[i];
}
/// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
/// checking.
__host__ __device__ __forceinline__ int getStride(int i) const {
return stride_[i];
}
/// Returns the total number of elements contained within our data
/// (product of `getSize(i)`)
__host__ __device__ ptrdiff_t numElements() const;
/// Returns the size array.
__host__ __device__ __forceinline__ const IndexT* sizes() const {
return size_;
}
/// Returns the stride array.
__host__ __device__ __forceinline__ const IndexT* strides() const {
return stride_;
}
/// Returns true if there is no padding within the tensor and no
/// re-ordering of the dimensions.
/// ~~~
/// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 1
/// ~~~
__host__ __device__ bool isContiguous() const;
/// Returns whether a given dimension has only increasing stride
/// from the previous dimension. A tensor that was permuted by
/// exchanging size and stride only will fail this check.
/// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
__host__ __device__ bool isConsistentlySized(int i) const;
// Returns whether at each dimension `stride <= size`.
// If this is not the case then iterating once over the size space will
// touch the same memory locations multiple times.
__host__ __device__ bool isConsistentlySized() const;
/// Returns true if the given dimension range [first, last) has no padding.
__host__ __device__ bool isContiguousRange(int first, int last) const;
/// Returns a tensor of the same dimension after transposing the two
/// dimensions given. Does not actually move elements; transposition
/// is made by permuting the size/stride arrays.
/// If the dimensions are not valid, asserts.
__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
transpose(int dim1, int dim2) const;
/// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the leading dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
upcastOuter();
/// Upcast a tensor of dimension `D` to some tensor of dimension
/// D' > D by padding the lowest/most varying dimensions by 1
/// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
upcastInner();
/// Downcast a tensor of dimension `D` to some tensor of dimension
/// D' < D by collapsing the leading dimensions. asserts if there is
/// padding on the leading dimensions.
template <int NewDim>
__host__ __device__
THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();
/// Downcast a tensor of dimension `D` to some tensor of dimension
/// D' < D by collapsing the leading dimensions. asserts if there is
/// padding on the leading dimensions.
template <int NewDim>
__host__ __device__
THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();
/// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting at `at`.
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
view(DataPtrType at);
/// Returns a tensor that is a view of the `SubDim`-dimensional slice
/// of this tensor, starting where our data begins
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
view();
/// Zeroes out the tensor asynchronously. Asserts if the contents
/// in question are not contiguous.
void zero(cudaStream_t stream = 0);
private:
/// Raw pointer to where the tensor data begins
DataPtrType data_;
/// Array of strides (in sizeof(T) terms) per each dimension
IndexT stride_[Dim];
/// Size per each dimension
IndexT size_[Dim];
};
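/* A hedged usage sketch (not part of the original header): wrapping a raw
 * device pointer in a 3-d THCDeviceTensor and indexing it from a kernel.
 * The kernel name, buffer and launch configuration are illustrative only.
 *
 *   __global__ void scale_kernel(THCDeviceTensor<float, 3> t, float s) {
 *     int c = blockIdx.x;
 *     for (int b = 0; b < t.getSize(0); ++b)
 *       for (int x = threadIdx.x; x < t.getSize(2); x += blockDim.x)
 *         t[b][c][x] = t[b][c][x] * s;   // element access via operator[]
 *   }
 *
 *   int sizes[3] = {N, C, L};
 *   THCDeviceTensor<float, 3> t(devPtr, sizes);  // row-major strides computed
 *   scale_kernel<<<C, getNumThreads(L)>>>(t, 2.0f);
 */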
namespace detail {
/// Specialization for a view of a single value (0-dimensional)
template <typename TensorType, template <typename U> class PtrTraits>
class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
public:
__host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
operator=(typename TensorType::DataType val) {
*data_ = val;
return *this;
}
// operator T&
__host__ __device__ operator typename TensorType::DataType&() {
return *data_;
}
// const operator T& returning const T&
__host__ __device__ operator const typename TensorType::DataType&() const {
return *data_;
}
// operator& returning T*
__host__ __device__ typename TensorType::DataType* operator&() {
return data_;
}
// const operator& returning const T*
__host__ __device__ const typename TensorType::DataType* operator&() const {
return data_;
}
/// Returns a raw accessor to our slice.
__host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
return data_;
}
/// Returns a raw accessor to our slice (const).
__host__ __device__ __forceinline__
const typename TensorType::DataPtrType data() const {
return data_;
}
/// Cast to a different datatype.
template <typename T>
__host__ __device__ T& as() {
return *dataAs<T>();
}
/// Cast to a different datatype (const).
template <typename T>
__host__ __device__ const T& as() const {
return *dataAs<T>();
}
/// Cast to a different datatype
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<T>::PtrType dataAs() {
return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
}
/// Cast to a different datatype (const)
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<const T>::PtrType dataAs() const {
return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
}
/// Use the texture cache for reads
__device__ __forceinline__ typename TensorType::DataType ldg() const {
#if __CUDA_ARCH__ >= 350
return __ldg(data_);
#else
return *data_;
#endif
}
/// Use the texture cache for reads; cast as a particular type
template <typename T>
__device__ __forceinline__ T ldgAs() const {
#if __CUDA_ARCH__ >= 350
return __ldg(dataAs<T>());
#else
return as<T>();
#endif
}
private:
/// One dimension greater can create us
friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;
/// Our parent tensor can create us
friend class THCDeviceTensor<typename TensorType::DataType,
1,
typename TensorType::IndexType,
PtrTraits>;
__host__ __device__ __forceinline__ THCDeviceSubTensor(
TensorType& t,
typename TensorType::DataPtrType data)
: tensor_(t),
data_(data) {
}
/// The tensor we're referencing
TensorType& tensor_;
/// Where our value is located
typename TensorType::DataPtrType const data_;
};
/// A `SubDim`-rank slice of a parent THCDeviceTensor
template <typename TensorType,
int SubDim,
template <typename U> class PtrTraits>
class THCDeviceSubTensor {
public:
/// Returns a view of the data located at our offset (the dimension
/// `SubDim` - 1 tensor).
__host__ __device__ __forceinline__
THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) {
return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_,
data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
}
/// Returns a view of the data located at our offset (the dimension
/// `SubDim` - 1 tensor) (const).
__host__ __device__ __forceinline__
const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
operator[](typename TensorType::IndexType index) const {
return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
tensor_,
data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
}
// operator& returning T*
__host__ __device__ typename TensorType::DataType* operator&() {
return data_;
}
// const operator& returning const T*
__host__ __device__ const typename TensorType::DataType* operator&() const {
return data_;
}
/// Returns a raw accessor to our slice.
__host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
return data_;
}
/// Returns a raw accessor to our slice (const).
__host__ __device__ __forceinline__
const typename TensorType::DataPtrType data() const {
return data_;
}
/// Cast to a different datatype.
template <typename T>
__host__ __device__ T& as() {
return *dataAs<T>();
}
/// Cast to a different datatype (const).
template <typename T>
__host__ __device__ const T& as() const {
return *dataAs<T>();
}
/// Cast to a different datatype
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<T>::PtrType dataAs() {
return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
}
/// Cast to a different datatype (const)
template <typename T>
__host__ __device__ __forceinline__
typename PtrTraits<const T>::PtrType dataAs() const {
return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
}
/// Use the texture cache for reads
__device__ __forceinline__ typename TensorType::DataType ldg() const {
#if __CUDA_ARCH__ >= 350
return __ldg(data_);
#else
return *data_;
#endif
}
/// Use the texture cache for reads; cast as a particular type
template <typename T>
__device__ __forceinline__ T ldgAs() const {
#if __CUDA_ARCH__ >= 350
return __ldg(dataAs<T>());
#else
return as<T>();
#endif
}
/// Returns a tensor that is a view of the SubDim-dimensional slice
/// of this tensor, starting where our data begins
THCDeviceTensor<typename TensorType::DataType,
SubDim,
typename TensorType::IndexType,
PtrTraits> view() {
return tensor_.template view<SubDim>(data_);
}
private:
/// One dimension greater can create us
friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;
/// Our parent tensor can create us
friend class
THCDeviceTensor<typename TensorType::DataType,
TensorType::NumDim,
typename TensorType::IndexType,
PtrTraits>;
__host__ __device__ __forceinline__ THCDeviceSubTensor(
TensorType& t,
typename TensorType::DataPtrType data)
: tensor_(t),
data_(data) {
}
/// The tensor we're referencing
TensorType& tensor_;
/// The start of our sub-region
typename TensorType::DataPtrType const data_;
};
} // namespace detail
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
*this, data_)[index]);
}
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
const_cast<TensorType&>(*this), data_)[index]);
}
#include "THCDeviceTensor-inl.cuh"
#endif // THC_DEVICE_TENSOR_INC
namespace detail {
// Add a layer of SFINAE to support static_assert
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct UpcastTHCRoot {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t);
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct UpcastTHC :
UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
};
// Never instantiated SFINAE purposes only
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true> {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t) {
thc_static_assert(NewDim > Dim);
return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
template upcastOuter<NewDim>();
}
};
// Add a layer of SFINAE to support static_assert
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct DowncastTHCRoot {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t);
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim, bool B>
struct DowncastTHC :
DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
};
// Never instantiated SFINAE purposes only
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
};
template <typename T, int Dim, typename IndexT,
template <typename U> class PtrTraits,
int NewDim>
struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true> {
static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
make(THCState* state, THCudaTensor* t) {
thc_static_assert(NewDim < Dim);
return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
template downcastOuter<NewDim>();
}
};
} // namespace detail
#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i) \
case i: \
if (NewDim > i) { \
return detail::UpcastTHC<T, i, IndexT, \
PtrTraits, NewDim, (NewDim > i)>:: \
make(state, t); \
} else if (NewDim == i) { \
return toDeviceTensor<T, NewDim, IndexT, PtrTraits>(state, t); \
} else { \
return detail::DowncastTHC<T, i, IndexT, \
PtrTraits, NewDim, (NewDim < i)>:: \
make(state, t); \
} \
/* break; */
template <typename T, int NewDim,
typename IndexT, template <typename U> class PtrTraits>
THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
toDeviceTensorCast(THCState* state, THCudaTensor* t) {
switch (THCudaTensor_nDimension(state, t)) {
SWITCH_UNROLL_CUDA_CAST_FACTORY(1);
SWITCH_UNROLL_CUDA_CAST_FACTORY(2);
SWITCH_UNROLL_CUDA_CAST_FACTORY(3);
SWITCH_UNROLL_CUDA_CAST_FACTORY(4);
SWITCH_UNROLL_CUDA_CAST_FACTORY(5);
SWITCH_UNROLL_CUDA_CAST_FACTORY(6);
SWITCH_UNROLL_CUDA_CAST_FACTORY(7);
SWITCH_UNROLL_CUDA_CAST_FACTORY(8);
SWITCH_UNROLL_CUDA_CAST_FACTORY(9);
SWITCH_UNROLL_CUDA_CAST_FACTORY(10);
default:
;
}
// Not implemented
THError("THCDeviceTensor dimension size not supported");
return NULL; /* never enters this piece, appeasing compiler warnings */
}
#undef SWITCH_UNROLL_CUDA_CAST_FACTORY