Commit 8f8fbb9f authored by Hang Zhang's avatar Hang Zhang
Browse files

v1.0.1

parent aa9af7fd
from .resnet import *
from .densenet import *
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from collections import OrderedDict
from ..nn import DilatedAvgPool2d
# Public API of this module, and download locations of the official
# torchvision ImageNet-pretrained checkpoints, keyed by model name.
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
model_urls = {
    'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
    'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
    'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
    'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
def densenet121(pretrained=False, **kwargs):
    r"""Build a Densenet-121 model
    (`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = DenseNet(num_init_features=64, growth_rate=32,
                   block_config=(6, 12, 24, 16), **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['densenet121']))
    return net
def densenet169(pretrained=False, **kwargs):
    r"""Build a Densenet-169 model
    (`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = DenseNet(num_init_features=64, growth_rate=32,
                   block_config=(6, 12, 32, 32), **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['densenet169']))
    return net
def densenet201(pretrained=False, **kwargs):
    r"""Build a Densenet-201 model
    (`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = DenseNet(num_init_features=64, growth_rate=32,
                   block_config=(6, 12, 48, 32), **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['densenet201']))
    return net
def densenet161(pretrained=False, **kwargs):
    r"""Build a Densenet-161 model
    (`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = DenseNet(num_init_features=96, growth_rate=48,
                   block_config=(6, 12, 36, 24), **kwargs)
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['densenet161']))
    return net
class _DenseLayer(nn.Sequential):
    # Bottleneck dense layer: BN-ReLU-Conv1x1 (to bn_size*growth_rate channels)
    # followed by BN-ReLU-Conv3x3 (to growth_rate channels).  The 3x3 conv may
    # be dilated; padding=dilation keeps the spatial size unchanged.
    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, dilation=1):
        super(_DenseLayer, self).__init__()
        # NOTE(review): dotted submodule names ('norm.1', ...) match the
        # torchvision pretrained state-dict keys of that era; recent PyTorch
        # releases reject '.' in module names -- confirm the targeted torch
        # version.  (The trailing commas make each line a 1-tuple; harmless.)
        self.add_module('norm.1', nn.BatchNorm2d(num_input_features)),
        self.add_module('relu.1', nn.ReLU(inplace=True)),
        self.add_module('conv.1', nn.Conv2d(num_input_features, bn_size *
                        growth_rate, kernel_size=1, stride=1, bias=False)),
        self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)),
        self.add_module('relu.2', nn.ReLU(inplace=True)),
        self.add_module('conv.2', nn.Conv2d(bn_size * growth_rate, growth_rate,
                        kernel_size=3, stride=1, padding=dilation, dilation=dilation,
                        bias=False)),
        self.drop_rate = drop_rate
    def forward(self, x):
        # Run the BN-ReLU-Conv stack, optionally apply dropout, then
        # concatenate the new features onto the input along channels
        # (dense connectivity).
        new_features = super(_DenseLayer, self).forward(x)
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
        return torch.cat([x, new_features], 1)
class _DenseBlock(nn.Sequential):
    """A stack of `_DenseLayer`s; layer i receives the block input plus the
    outputs of all previous layers (each layer adds `growth_rate` channels)."""

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, dilation=1):
        super(_DenseBlock, self).__init__()
        for idx in range(num_layers):
            in_channels = num_input_features + idx * growth_rate
            dense_layer = _DenseLayer(in_channels, growth_rate, bn_size,
                                      drop_rate, dilation=dilation)
            self.add_module('denselayer%d' % (idx + 1), dense_layer)
class _Transition(nn.Sequential):
    """Transition between dense blocks: BN-ReLU-Conv1x1 channel reduction,
    then a (possibly dilated) 2x2 average pooling with the given stride."""

    def __init__(self, num_input_features, num_output_features, stride, dilation=1):
        super(_Transition, self).__init__()
        # Modules are created in this order (matters for seeded param init).
        stages = [
            ('norm', nn.BatchNorm2d(num_input_features)),
            ('relu', nn.ReLU(inplace=True)),
            ('conv', nn.Conv2d(num_input_features, num_output_features,
                               kernel_size=1, stride=1, bias=False)),
            ('pool', DilatedAvgPool2d(kernel_size=2, stride=stride,
                                      dilation=dilation)),
        ]
        for name, module in stages:
            self.add_module(name, module)
class DenseNet(nn.Module):
    r"""Dilated Densenet-BC model class
    Args:
        growth_rate (int) - how many filters to add each layer (`k` in paper)
        block_config (list of 4 ints) - how many layers in each pooling block
        num_init_features (int) - the number of filters to learn in the first convolution layer
        bn_size (int) - multiplicative factor for number of bottle neck layers
            (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
        num_classes (int) - number of classification classes
    Reference:
        Huang, Gao, et al. "Densely Connected Convolutional Networks" *CVPR 2017*
    """
    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
        super(DenseNet, self).__init__()
        # First convolution: stride-2 conv + stride-2 max pool -> overall stride 4.
        self.features = nn.Sequential(OrderedDict([
            ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
            ('norm0', nn.BatchNorm2d(num_init_features)),
            ('relu0', nn.ReLU(inplace=True)),
            ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
        ]))
        # Each denseblock.  Only the transition after block 1 downsamples
        # (stride 2); blocks 3 and 4 use dilation 2 and 4 instead, so the
        # network output keeps an overall stride of 8.
        strides = [1,2,1,1]
        dilations = [1,1,2,4]
        num_features = num_init_features
        for i, num_layers in enumerate(block_config):
            block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
                                bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate,
                                dilation=dilations[i])
            self.features.add_module('denseblock%d' % (i + 1), block)
            num_features = num_features + num_layers * growth_rate
            if i != len(block_config) - 1:
                # Transition halves the channel count; it is given the *next*
                # stage's stride together with the *current* stage's dilation.
                trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2, stride=strides[i+1], dilation=dilations[i])
                self.features.add_module('transition%d' % (i + 1), trans)
                num_features = num_features // 2
        # Final batch norm
        self.features.add_module('norm5', nn.BatchNorm2d(num_features))
        # Linear layer (unused by forward() below; kept so pretrained
        # classifier weights can still be loaded).
        self.classifier = nn.Linear(num_features, num_classes)
    def forward(self, x):
        features = self.features(x)
        out = F.relu(features, inplace=True)
        # NOTE: the pooling + classifier path is disabled (left below as a
        # bare string literal), so forward() returns the stride-8 feature
        # maps rather than class logits.
        """
        out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1)
        out = self.classifier(out)
        """
        return out
from .. import nn
import math
from torch.autograd import Variable
import torch.utils.model_zoo as model_zoo
# Public API of this module, and download locations of the official
# torchvision ImageNet-pretrained checkpoints, keyed by model name.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with 1-pixel padding and no bias term."""
    conv = nn.Conv2d(in_planes, out_planes, kernel_size=3,
                     stride=stride, padding=1, bias=False)
    return conv
class BasicBlock(nn.Module):
    """Residual basic block: two 3x3 convolutions with a skip connection.

    The first conv uses `dilation`, the second uses `fist_dilation` (sic);
    an optional `downsample` module projects the identity branch.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1):
        super(BasicBlock, self).__init__()
        # Submodules are created in the same order as registered below
        # (relevant for seeded parameter initialization).
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                               padding=dilation, dilation=dilation, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=fist_dilation, dilation=fist_dilation,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 (dilated) -> 1x1, expansion 4.

    `forward` also accepts tuple/list inputs (multi-GPU path), adding the
    residual entry-wise via `_sum_each`.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1):
        super(Bottleneck, self).__init__()
        # Creation order kept stable for seeded parameter initialization.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=dilation, dilation=dilation, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def _sum_each(self, x, y):
        # Element-wise addition of two equal-length lists of tensors.
        assert(len(x) == len(y))
        return [a + b for a, b in zip(x, y)]

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        residual = x if self.downsample is None else self.downsample(x)
        if isinstance(out, Variable):
            out += residual
        elif isinstance(out, (tuple, list)):
            out = self._sum_each(out, residual)
        return self.relu(out)
class ResNet(nn.Module):
    """Dilated pre-trained ResNet model, which produces stride-8 feature maps
    at conv5 (layer4): the stride-2 downsampling of layer3/layer4 is replaced
    with dilation 2/4.
    Reference:
        Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
    """
    def __init__(self, block, layers, num_classes=1000):
        # block: BasicBlock or Bottleneck; layers: number of blocks per stage.
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        # Stages 3/4 keep stride 1 and dilate instead -> overall stride 8.
        self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4)
        # NOTE(review): with overall stride 8, a 224x224 input reaches layer4
        # at 28x28, so avgpool(7) yields 4x4 and the flatten in forward()
        # would not match fc's in_features.  The classification head looks
        # unused (this class serves as a dense-prediction backbone) -- confirm.
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        # He-style initialization for convs; BN affine params to identity.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
    def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
        # Build one residual stage.  A 1x1-conv projection branch is added
        # when the spatial stride or channel width changes.
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        # The first block of a dilation-4 stage ramps up through dilation 2;
        # `fist_dilation` (sic, presumably "first_dilation") is the dilation
        # passed to the block's second conv.
        if dilation == 1 or dilation == 2:
            layers.append(block(self.inplanes, planes, stride, dilation=1,
                                downsample=downsample, fist_dilation=dilation))
        elif dilation == 4:
            layers.append(block(self.inplanes, planes, stride, dilation=2,
                                downsample=downsample, fist_dilation=dilation))
        else:
            raise RuntimeError("=> unknown dilation size: {}".format(dilation))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, dilation=dilation, fist_dilation=dilation))
        return nn.Sequential(*layers)
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
def resnet18(pretrained=False, **kwargs):
    """Build a (dilated) ResNet-18.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return net
    state = model_zoo.load_url(model_urls['resnet18'])
    net.load_state_dict(state)
    return net
def resnet34(pretrained=False, **kwargs):
    """Build a (dilated) ResNet-34.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    state = model_zoo.load_url(model_urls['resnet34'])
    net.load_state_dict(state)
    return net
def resnet50(pretrained=False, **kwargs):
    """Build a (dilated) ResNet-50.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net
    state = model_zoo.load_url(model_urls['resnet50'])
    net.load_state_dict(state)
    return net
def resnet101(pretrained=False, **kwargs):
    """Build a (dilated) ResNet-101.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net
    state = model_zoo.load_url(model_urls['resnet101'])
    net.load_state_dict(state)
    return net
def resnet152(pretrained=False, **kwargs):
    """Build a (dilated) ResNet-152.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net
    state = model_zoo.load_url(model_urls['resnet152'])
    net.load_state_dict(state)
    return net
if __name__ == "__main__":
    # Smoke test: build a dilated ResNet-101, load the ImageNet weights
    # (downloads via model_zoo), and print the conv5 stage.
    model = ResNet(Bottleneck, [3, 4, 23, 3])
    model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    print(model.layer4)
from .aggregate import aggregate, scaledL2, aggregateP, residual, square_squeeze, assign
from .encoding import *
from .basic import *
from .syncbn import *
from .customize import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# List-wise tensor helpers used by the multi-GPU (parallel) utilities.
__all__ = ['view_each', 'multi_each', 'sum_each', 'upsample']
def view_each(x, size):
    """Apply ``.view(size)`` to every tensor in the list `x`; returns a new list."""
    return [tensor.view(size) for tensor in x]
def multi_each(a, b):
    """Element-wise product of two lists: returns ``[a[i] * b[i]]`` for each i."""
    return [a[i] * b[i] for i in range(len(a))]
def sum_each(x, y):
    """Element-wise sum of two equal-length lists: ``[x[i] + y[i]]`` for each i."""
    assert(len(x) == len(y))
    return [x[i] + y[i] for i in range(len(x))]
def upsample(input, size=None, scale_factor=None, mode='nearest'):
    """Upsample a tensor, or each tensor of a tuple/list, via ``F.upsample``.

    Args:
        input: a Variable/Tensor, or a tuple/list of them (one per GPU).
        size / scale_factor / mode: forwarded to ``F.upsample``.

    Returns:
        The upsampled tensor, or a list of upsampled tensors in input order.
        List entries are processed by one thread each, on the CUDA device of
        the corresponding tensor.

    Raises:
        RuntimeError: if `input` is neither a Variable nor a tuple/list.
        Any exception raised inside a worker thread is re-raised here.
    """
    if isinstance(input, Variable):
        return F.upsample(input, size=size, scale_factor=scale_factor,
                          mode=mode)
    elif isinstance(input, tuple) or isinstance(input, list):
        lock = threading.Lock()
        results = {}

        def _worker(i, x):
            try:
                with torch.cuda.device_of(x):
                    result = F.upsample(x, size=size,
                                        scale_factor=scale_factor, mode=mode)
                with lock:
                    results[i] = result
            except Exception as e:
                with lock:
                    # BUG FIX: was `resutls[i] = e` (NameError), which killed
                    # the worker silently and dropped its result slot, so
                    # errors were swallowed and the output list came up short.
                    results[i] = e

        # One thread per input tensor (multi-threading for different GPUs).
        threads = [threading.Thread(target=_worker, args=(i, x))
                   for i, x in enumerate(input)]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        # Gather results in input order, re-raising any worker exception.
        def _list_gather(x):
            y = []
            for i in range(len(x)):
                xi = x[i]
                if isinstance(xi, Exception):
                    raise xi
                y.append(xi)
            return y
        return _list_gather(results)
    else:
        raise RuntimeError('unknown input type')
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import math
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from torch.nn.modules.utils import _single, _pair, _triple
from .._ext import encoding_lib
# Functional interface exported by this module.
__all__ = ['dilatedavgpool2d']
class _dilatedavgpool2d(Function):
    # Old-style (instance) autograd Function wrapping the CUDA dilated
    # average-pooling kernels of `encoding_lib`.
    def forward(self, input, kernel_size, stride, padding,
                dilation=1):
        # Normalize all hyper-parameters to (h, w) pairs; stride defaults to
        # the kernel size, as in nn.AvgPool2d.
        self.kH, self.kW = _pair(kernel_size)
        self.dH, self.dW = _pair(stride if stride is not None else
                                 kernel_size)
        self.padH, self.padW = _pair(padding)
        self.dilationH, self.dilationW = _pair(dilation)
        b, c, h, w = input.size()
        if self.dH == 1 and self.dW == 1:
            # Stride 1: keep the spatial size (dilated pooling for DenseNet).
            ow, oh = w, h
        else:
            ow = math.floor(float(w - self.kW + 2 * self.padW) / float(self.dW)) + 1
            oh = math.floor(float(h - self.kH + 2 * self.padH) / float(self.dH)) + 1
        output = input.new(b, c, oh, ow)
        self.save_for_backward(input)
        encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(input, output,
            self.kH, self.kW, self.dH, self.dW, self.padH, self.padW,
            self.dilationH, self.dilationW)
        return output

    def backward(self, gradOutput):
        input, = self.saved_variables
        gradInput = input.new().resize_as_(input)
        # BUG FIX: the CUDA call previously passed undefined lowercase names
        # `gradinput` / `gradoutput`, raising NameError on every backward pass.
        encoding_lib.Encoding_Float_DilatedAvgPool2d_Backward(
            gradInput, gradOutput,
            self.kH, self.kW, self.dH, self.dW, self.padH, self.padW,
            self.dilationH, self.dilationW)
        # One gradient per forward argument; the hyper-parameters get None.
        return gradInput, None, None, None, None
def dilatedavgpool2d(input, kernel_size, stride=None, padding=0,
                     dilation=1):
    """Dilated Average Pool 2d, for dilation of DenseNet.
    Applies 2D average-pooling operation in kh x kw regions by step size
    dh x dw steps. The number of output features is equal to the number of
    input planes.
    See :class:`~encoding.nn.DilatedAvgPool2d` for details and output shape.
    Args:
        input: input tensor (minibatch x in_channels x iH x iW)
        kernel_size: size of the pooling region, a single number or a
          tuple (kh x kw)
        stride: stride of the pooling operation, a single number or a
          tuple (sh x sw). Default is equal to kernel size
        padding: implicit zero padding on the input, a single number or
          a tuple (padh x padw), Default: 0
        dilation: the dilation parameter similar to Conv2d
    """
    # BUG FIX / consistency: `_dilatedavgpool2d` is an old-style (instance)
    # autograd Function -- it must be instantiated and called, like every
    # other functional in this package; `.apply` is for static-method
    # (new-style) Functions only.
    return _dilatedavgpool2d()(input, kernel_size, stride, padding,
                               dilation)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from .._ext import encoding_lib
# Public functional API of this module.
# NOTE(review): the package __init__ also imports `square_squeeze` from this
# module, but no public wrapper by that name is visible here -- verify.
__all__ = ['aggregate', 'scaledL2', 'aggregateP', 'residual', 'assign']
class _aggregate(Function):
    # Old-style autograd Function computing, via the CUDA extension:
    #   E[b,k,:] = sum_i A[b,i,k] * (X[b,i,:] - C[k,:])
    def forward(self, A, X, C):
        # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
        self.save_for_backward(A, X, C)
        B, N, K = A.size()
        D = X.size(2)
        with torch.cuda.device_of(A):
            E = A.new(B, K, D)
        # Dispatch on the CUDA dtype; only float/double are implemented.
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregateE_forward(E, A, X, C)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregateE_forward(E, A, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        return E
    def backward(self, gradE):
        A, X, C = self.saved_tensors
        with torch.cuda.device_of(A):
            gradA = A.new().resize_as_(A)
            gradX = A.new().resize_as_(X)
            gradC = A.new().resize_as_(C)
        # gradA comes from the CUDA kernel; gradX/gradC are computed below.
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregateE_backward(gradA,
                    gradE, A, X, C)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregateE_backward(gradA,
                    gradE, A, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        # gradX = A @ gradE (batched); gradC folds -gradE weighted by the
        # per-codeword column sums of A, summed over the batch.
        gradX.copy_(torch.bmm(A, gradE))
        gradC.copy_((-gradE * A.sum(1).unsqueeze(2)).sum(0))
        return gradA, gradX, gradC
def aggregate(A, X, C):
    r"""Aggregate the residuals of the inputs (:math:`X`) with respect to the
    codewords (:math:`C`), weighted by the assignments (:math:`A`).
    .. math::
        e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - d_k)
    Shape:
        - Input: :math:`A\in\mathcal{R}^{B\times N\times K}`
          :math:`X\in\mathcal{R}^{B\times N\times D}`
          :math:`C\in\mathcal{R}^{K\times D}`
          (:math:`B` batch, :math:`N` features, :math:`K` codewords,
          :math:`D` feature dimensions)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`
    Examples:
        >>> B,N,K,D = 2,3,4,5
        >>> A = Variable(torch.cuda.DoubleTensor(B,N,K).uniform_(-0.5,0.5), requires_grad=True)
        >>> X = Variable(torch.cuda.DoubleTensor(B,N,D).uniform_(-0.5,0.5), requires_grad=True)
        >>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
        >>> E = aggregate(A, X, C)
    """
    op = _aggregate()
    return op(A, X, C)
class _scaledL2(Function):
    # Old-style autograd Function computing, via the CUDA extension:
    #   SL[b,i,k] = S[k] * ||X[b,i,:] - C[k,:]||^2
    def forward(self, X, C, S):
        B, N, D = X.size()
        K = C.size(0)
        with torch.cuda.device_of(X):
            SL = X.new(B, N, K)
        # Dispatch on the CUDA dtype; only float/double are implemented.
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
        elif isinstance(X, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
        else:
            raise RuntimeError('Unimplemented data type!')
        # SL is saved too: backward reuses it for the scale gradient.
        self.save_for_backward(X, C, S, SL)
        return SL
    def backward(self, gradSL):
        X, C, S, SL = self.saved_tensors
        K = C.size(0)
        with torch.cuda.device_of(X):
            gradX = X.new().resize_as_(X)
            gradC = X.new().resize_as_(C)
            gradS = X.new().resize_as_(S)
        # gradX/gradC come from the CUDA kernel; gradS is computed below.
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_scaledl2_backward(gradSL,
                    gradX, gradC, X, C, S)
        elif isinstance(X, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Double_scaledl2_backward(gradSL,
                    gradX, gradC, X, C, S)
        else:
            raise RuntimeError('Unimplemented data type!')
        # dSL/dS[k] = ||x_i - c_k||^2 = SL/S[k]; fold over batch and N dims.
        gradS.copy_((gradSL * (SL / S.view(1, 1, K))).sum(0).sum(0))
        return gradX, gradC, gradS
def scaledL2(X, C, S):
    r"""Scaled L2 distance between every input and every codeword.
    .. math::
        sl_{ik} = s_k \|x_i-c_k\|^2
    Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}`,
          :math:`C\in\mathcal{R}^{K\times D}`, :math:`S\in\mathcal{R}^K`
          (:math:`B` batch, :math:`N` features, :math:`K` codewords,
          :math:`D` feature dimensions)
        - Output: :math:`E\in\mathcal{R}^{B\times N\times K}`
    """
    op = _scaledL2()
    return op(X, C, S)
class _aggregateP(Function):
    # Old-style autograd Function: aggregate pre-computed residuals R
    # (BxNxKxD) with assignments A (BxNxK) into E (BxKxD), via CUDA.
    def forward(self, A, R):
        # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
        self.save_for_backward(A, R)
        B, N, K, D = R.size()
        with torch.cuda.device_of(A):
            E = A.new(B, K, D)
        # Dispatch on the CUDA dtype; only float/double are implemented.
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregate_forward(E, A, R)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregate_forward(E, A, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        return E
    def backward(self, gradE):
        # Both gradients are produced by a single fused CUDA kernel.
        A, R = self.saved_tensors
        with torch.cuda.device_of(A):
            gradA = A.new().resize_as_(A)
            gradR = R.new().resize_as_(R)
        if isinstance(A, torch.cuda.FloatTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Float_aggregate_backward(gradA,
                    gradR, gradE, A, R)
        elif isinstance(A, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(A):
                encoding_lib.Encoding_Double_aggregate_backward(gradA,
                    gradR, gradE, A, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradA, gradR
def aggregateP(A, R):
    """Aggregate pre-computed residuals R (B x N x K x D) with assignment
    weights A (B x N x K) into encodings E (B x K x D)."""
    op = _aggregateP()
    return op(A, R)
class _residual(Function):
    # Old-style autograd Function: R[b,i,k,:] = X[b,i,:] - C[k,:], via CUDA.
    def forward(self, X, C):
        # X \in(BxNxD) D \in(KxD) R \in(BxNxKxD)
        B, N, D = X.size()
        K = C.size(0)
        with torch.cuda.device_of(X):
            R = X.new(B, N, K, D)
        # Dispatch on the CUDA dtype; only float/double are implemented.
        if isinstance(X, torch.cuda.FloatTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Float_residual_forward(R, X, C)
        elif isinstance(X, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(X):
                encoding_lib.Encoding_Double_residual_forward(R, X, C)
        else:
            raise RuntimeError('Unimplemented data type!')
        return R
    def backward(self, gradR):
        # Nothing was saved in forward: the op is affine, so both gradients
        # depend only on gradR (the CUDA kernel reduces over the right axes).
        B, N, K, D = gradR.size()
        with torch.cuda.device_of(gradR):
            gradX = gradR.new(B, N, D)
            gradD = gradR.new(K, D)
        if isinstance(gradR, torch.cuda.FloatTensor):
            with torch.cuda.device_of(gradR):
                encoding_lib.Encoding_Float_residual_backward(gradR,
                    gradX, gradD)
        elif isinstance(gradR, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(gradR):
                encoding_lib.Encoding_Double_residual_backward(gradR,
                    gradX, gradD)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradX, gradD
def residual(X, C):
    r"""Residuals of every input with respect to every codeword, over a
    mini-batch.
    .. math::
        r_{ik} = x_i - c_k
    Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}`,
          :math:`C\in\mathcal{R}^{K\times D}` (:math:`B` batch, :math:`N`
          features, :math:`K` codewords, :math:`D` feature dimensions)
        - Output: :math:`R\in\mathcal{R}^{B\times N\times K\times D}`
    """
    op = _residual()
    return op(X, C)
class _square_squeeze(Function):
    # Old-style autograd Function: squeeze the last axis of R (BxNxKxD) by
    # taking its squared norm, L[b,i,k] = ||R[b,i,k,:]||^2, via CUDA.
    def forward(self, R):
        B, N, K, D = R.size()
        with torch.cuda.device_of(R):
            L = R.new(B, N, K)
        # Dispatch on the CUDA dtype; only float/double are implemented.
        if isinstance(R, torch.cuda.FloatTensor):
            with torch.cuda.device_of(R):
                encoding_lib.Encoding_Float_squaresqueeze_forward(L, R)
        elif isinstance(R, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(R):
                encoding_lib.Encoding_Double_squaresqueeze_forward(L, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        self.save_for_backward(L, R)
        return L
    def backward(self, gradL):
        L, R = self.saved_tensors
        B, N, K, D = R.size()
        with torch.cuda.device_of(R):
            gradR = R.new(B, N, K, D)
        if isinstance(R, torch.cuda.FloatTensor):
            with torch.cuda.device_of(gradL):
                encoding_lib.Encoding_Float_squaresqueeze_backward(gradL,
                    gradR, R)
        elif isinstance(R, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(gradL):
                encoding_lib.Encoding_Double_squaresqueeze_backward(gradL,
                    gradR, R)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradR
def assign(R, S):
    r"""
    Calculate assignment weights for given residuals (:math:`R`) and scale (:math:`S`)
    .. math::
        a_{ik} = \frac{exp(-s_k\|r_{ik}\|^2)}{\sum_{j=1}^K exp(-s_j\|r_{ik}\|^2)}
    Shape:
        - Input: :math:`R\in\mathcal{R}^{B\times N\times K\times D}` :math:`S\in \mathcal{R}^K` (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number is codewords, :math:`D` is feature dimensions.)
        - Output :math:`A\in\mathcal{R}^{B\times N\times K}`
    """
    # L[b,i,k] = ||r_{ik}||^2, scaled per-codeword by S.
    L = _square_squeeze()(R)
    K = S.size(0)
    SL = L * S.view(1, 1, K)
    # BUG FIX: normalize over the codeword axis (dim=2), as in the formula
    # above; the previous implicit-dim F.softmax is deprecated and picks an
    # ambiguous dimension for 3D input.
    return F.softmax(SL, dim=2)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from .._ext import encoding_lib
# Functional interface used by the synchronized BatchNorm modules.
__all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
class _sum_square(Function):
    # Autograd Function (ctx-style signatures, still invoked old-style via
    # `_sum_square()(input)`): per-channel sum(x) and sum(x^2) over the batch
    # and spatial dimensions of a 4D input, via the CUDA extension.
    def forward(ctx, input):
        ctx.save_for_backward(input)
        B, C, H, W = input.size()
        with torch.cuda.device_of(input):
            xsum = input.new().resize_(C).zero_()
            xsquare = input.new().resize_(C).zero_()
        # Dispatch on the CUDA dtype; only float/double are implemented.
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_sum_square_Forward(
                    input.view(B, C, -1), xsum, xsquare)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_sum_square_Forward(
                    input.view(B, C, -1), xsum, xsquare)
        else:
            raise RuntimeError('Unimplemented data type!')
        return xsum, xsquare
    def backward(ctx, gradSum, gradSquare):
        input, = ctx.saved_tensors
        B, C, H, W = input.size()
        with torch.cuda.device_of(input):
            gradInput = input.new().resize_(B, C, H * W).zero_()
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_sum_square_Backward(
                    gradInput, input.view(B, C, -1), gradSum, gradSquare)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_sum_square_Backward(
                    gradInput, input.view(B, C, -1), gradSum, gradSquare)
        else:
            raise RuntimeError('Unimplemented data type!')
        # Restore the original 4D shape of the gradient.
        return gradInput.view(B, C, H, W)
def sum_square(input):
    r"""Compute per-channel sum of elements and sum of squares of a 4D input,
    as required by (synchronized) Batch Normalization."""
    op = _sum_square()
    return op(input)
class _batchnormtrain(Function):
    # Autograd Function applying batch normalization to a 3D (N, C, L) input
    # with externally supplied per-channel mean/std, via the CUDA extension.
    # Differs from `_batchnormeval` only in the final `True` flag passed to
    # the CUDA backward (training-mode gradients).
    def forward(ctx, input, gamma, beta, mean, std):
        ctx.save_for_backward(input, gamma, beta, mean, std)
        assert(input.dim() == 3)
        with torch.cuda.device_of(input):
            invstd = 1.0 / std
            output = input.new().resize_as_(input)
        # Dispatch on the CUDA dtype; only float/double are implemented.
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_batchnorm_Forward(output,
                    input, mean, invstd, gamma, beta)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_batchnorm_Forward(output,
                    input, mean, invstd, gamma, beta)
        else:
            raise RuntimeError('Unimplemented data type!')
        return output
    def backward(ctx, gradOutput):
        input, gamma, beta, mean, std = ctx.saved_tensors
        invstd = 1.0 / std
        with torch.cuda.device_of(input):
            gradInput = gradOutput.new().resize_as_(input).zero_()
            gradGamma = gradOutput.new().resize_as_(gamma).zero_()
            gradBeta = gradOutput.new().resize_as_(beta).zero_()
            gradMean = gradOutput.new().resize_as_(mean).zero_()
            gradStd = gradOutput.new().resize_as_(std).zero_()
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_batchnorm_Backward(
                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
                    True)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_batchnorm_Backward(
                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
                    True)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradInput, gradGamma, gradBeta, gradMean, gradStd
def batchnormtrain(input, gamma, beta, mean, std):
    r"""Apply Batch Normalization over a 3d input that is seen as a
    mini-batch, using the supplied per-channel mean and std.
    .. _encoding.batchnormtrain:
    .. math::
        y = \frac{x - \mu[x]}{ \sqrt{var[x] + \epsilon}} * \gamma + \beta
    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
    """
    op = _batchnormtrain()
    return op(input, gamma, beta, mean, std)
class _batchnormeval(Function):
    # Evaluation-mode twin of `_batchnormtrain`: identical forward; the only
    # difference is the final `False` flag passed to the CUDA backward.
    def forward(ctx, input, gamma, beta, mean, std):
        ctx.save_for_backward(input, gamma, beta, mean, std)
        assert(input.dim() == 3)
        with torch.cuda.device_of(input):
            invstd = 1.0 / std
            output = input.new().resize_as_(input)
        # Dispatch on the CUDA dtype; only float/double are implemented.
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_batchnorm_Forward(output,
                    input, mean, invstd, gamma, beta)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_batchnorm_Forward(output,
                    input, mean, invstd, gamma, beta)
        else:
            raise RuntimeError('Unimplemented data type!')
        return output
    def backward(ctx, gradOutput):
        input, gamma, beta, mean, std = ctx.saved_tensors
        invstd = 1.0 / std
        with torch.cuda.device_of(input):
            gradInput = gradOutput.new().resize_as_(input).zero_()
            gradGamma = gradOutput.new().resize_as_(gamma).zero_()
            gradBeta = gradOutput.new().resize_as_(beta).zero_()
            gradMean = gradOutput.new().resize_as_(mean).zero_()
            gradStd = gradOutput.new().resize_as_(std).zero_()
        if isinstance(input, torch.cuda.FloatTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Float_batchnorm_Backward(
                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
                    False)
        elif isinstance(input, torch.cuda.DoubleTensor):
            with torch.cuda.device_of(input):
                encoding_lib.Encoding_Double_batchnorm_Backward(
                    gradOutput, input, gradInput, gradGamma, gradBeta,
                    mean, invstd, gamma, beta, gradMean, gradStd,
                    False)
        else:
            raise RuntimeError('Unimplemented data type!')
        return gradInput, gradGamma, gradBeta, gradMean, gradStd
def batchnormeval(input, gamma, beta, mean, std):
    r"""Applies Batch Normalization over a 3d input that is seen as a
    mini-batch.
    Please see encoding.batchnormtrain_
    """
    # Old-style autograd Function: instantiate, then apply.
    op = _batchnormeval()
    return op(input, gamma, beta, mean, std)
......@@ -31,8 +31,8 @@ __global__ void Encoding_(AggregateE_Forward_kernel) (
k = blockIdx.y * blockDim.y + threadIdx.y;
N = A.getSize(1);
/* boundary check for output */
sum = 0;
if (d >= E.getSize(2) || k >= E.getSize(1)) return;
sum = 0;
/* main operation */
for(i=0; i<N; i++) {
sum += A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
......@@ -49,9 +49,9 @@ void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
if (THCTensor_(nDimension)(state, E_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
......@@ -62,7 +62,7 @@ void Encoding_(AggregateE_Forward)(THCState *state, THCTensor *E_,
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(E.getSize(2)/16+1, E.getSize(1)/16+1,
E.getSize(0));
E.getSize(0));
Encoding_(AggregateE_Forward_kernel)<<<blocks, threads, 0, stream>>>
(E, A, X, C);
THCudaCheck(cudaGetLastError());
......@@ -527,491 +527,5 @@ void Encoding_(ScaledL2_Backward)(
(GSL, GC, X, C, S);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(SquareSqueeze_Forward_kernel) (
    THCDeviceTensor<real, 3> L,
    THCDeviceTensor<real, 4> R)
/*
 * Forward kernel: squared L2 norm over the last dimension of R,
 * L[b][i][k] = sum_d R[b][i][k][d]^2.  One thread per output element.
 */
{
    int b = blockIdx.z;
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    /* boundary check for output */
    if (k >= L.getSize(2) || i >= L.getSize(1)) return;
    int D = R.getSize(3);
    real acc = 0;
    for (int d = 0; d < D; d++) {
        real v = R[b][i][k][d].ldg();
        acc += v * v;
    }
    L[b][i][k] = acc;
}
void Encoding_(SquareSqueeze_Forward)(
    THCState *state, THCTensor *L_, THCTensor *R_)
/*
 * Host launcher for the sum-of-squares reduction above.
 */
{
    /* validate device placement and tensor ranks */
    THCTensor_(checkGPU)(state, 2, L_, R_);
    if (THCTensor_(nDimension)(state, L_) != 3 ||
        THCTensor_(nDimension)(state, R_) != 4)
        THError("Encoding: incorrect input dims. \n");
    THCDeviceTensor<real, 3> L = devicetensor<3>(state, L_);
    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
    /* 16x16 tiles over (k, i); one z-block per batch element */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(L.getSize(2)/16+1, L.getSize(1)/16+1, L.getSize(0));
    Encoding_(SquareSqueeze_Forward_kernel)<<<blocks, threads, 0, stream>>>
        (L, R);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(SquareSqueeze_Backward_kernel) (
    THCDeviceTensor<real, 3> GL,
    THCDeviceTensor<real, 4> GR,
    THCDeviceTensor<real, 4> R)
/*
 * Backward kernel: d(sum_d R^2)/dR = 2 * R, scaled by the incoming
 * gradient, so GR[b][i][k][d] = 2 * GL[b][i][k] * R[b][i][k][d].
 */
{
    int b = blockIdx.z;
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    /* boundary check for output */
    if (k >= R.getSize(2) || i >= R.getSize(1)) return;
    int D = R.getSize(3);
    real coef = GL[b][i][k] * 2;
    for (int d = 0; d < D; d++) {
        GR[b][i][k][d] = coef * R[b][i][k][d];
    }
}
void Encoding_(SquareSqueeze_Backward)(
    THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_)
/*
 * Host launcher for the sum-of-squares backward kernel.
 */
{
    /* validate device placement and tensor ranks */
    THCTensor_(checkGPU)(state, 3, GL_, GR_, R_);
    if (THCTensor_(nDimension)(state, GL_) != 3 ||
        THCTensor_(nDimension)(state, GR_) != 4 ||
        THCTensor_(nDimension)(state, R_) != 4)
        THError("Encoding: incorrect input dims. \n");
    THCDeviceTensor<real, 3> GL = devicetensor<3>(state, GL_);
    THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
    /* 16x16 tiles over (k, i); one z-block per batch element */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(R.getSize(2)/16+1, R.getSize(1)/16+1, R.getSize(0));
    Encoding_(SquareSqueeze_Backward_kernel)<<<blocks, threads, 0, stream>>>
        (GL, GR, R);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/* Normalize a flattened (N, C, L) input with per-channel statistics:
 * out = gamma * (x - mean) * invstd + beta.
 * One block per channel c; threads stride over the last dimension. */
__global__ void Encoding_(BatchNorm_Forward_kernel) (
    THCDeviceTensor<real, 3> output,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> mean,
    THCDeviceTensor<real, 1> invstd,
    THCDeviceTensor<real, 1> gamma,
    THCDeviceTensor<real, 1> beta)
{
    int c = blockIdx.x;
    /* main operation */
    for (int b = 0; b < input.getSize(0); ++b) {
        for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
            real inp = input[b][c][x].ldg();
            output[b][c][x] = gamma[c].ldg() * (inp - mean[c].ldg()) *
                invstd[c].ldg() + beta[c].ldg();
        }
    }
}
void Encoding_(BatchNorm_Forward)(THCState *state,
    THCTensor *output_, THCTensor *input_,
    THCTensor *mean_, THCTensor *invstd_,
    THCTensor *gamma_, THCTensor *beta_)
/*
 * Batch norm forward host launcher.
 * Assumes the input has already been flattened to (N, C, L).
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 6, output_, input_, mean_, invstd_,
        gamma_, beta_);
    if (THCTensor_(nDimension)(state, output_) != 3 ||
        THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, mean_) != 1 ||
        THCTensor_(nDimension)(state, invstd_) != 1 ||
        THCTensor_(nDimension)(state, gamma_) != 1 ||
        THCTensor_(nDimension)(state, beta_) != 1)
        THError("BatchNorm2d forward: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> output = devicetensor<3>(state, output_);
    THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
    THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
    THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
    THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
    THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
    /* one block per channel, threads strided over the last dim */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(BatchNorm_Forward_kernel)<<<blocks, threads, 0, stream>>>(
        output, input, mean, invstd, gamma, beta);
    THCudaCheck(cudaGetLastError());
}
/* Two-component accumulator used by the warp/block reductions below. */
struct Encoding_(Float2){
    real v1, v2;
    __device__ Encoding_(Float2)() {}
    __device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
    __device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
    __device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}
    __device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a) {
        v1 += a.v1;
        v2 += a.v2;
        return *this;
    }
};
/* Sum a scalar across the lanes of one warp.  On sm_30+ this is the
 * __shfl_xor butterfly; otherwise a shared-memory fallback.
 * NOTE(review): __shfl_xor is the pre-CUDA-9 (non-_sync) intrinsic. */
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if __CUDA_ARCH__ >= 300
    for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
        val += __shfl_xor(val, 1 << i, WARP_SIZE);
    }
#else
    __shared__ real values[MAX_BLOCK_SIZE];
    values[threadIdx.x] = val;
    __threadfence_block();
    const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
    /* add the other WARP_SIZE-1 lanes; own value is already in val */
    for (int i = 1; i < WARP_SIZE; i++) {
        val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
    }
#endif
    return val;
}
/* Component-wise warp sum of a Float2. */
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(Encoding_(Float2) value) {
    value.v1 = Encoding_(rwarpSum)(value.v1);
    value.v2 = Encoding_(rwarpSum)(value.v2);
    return value;
}
/* Reduction payload for batch norm backward: per element, yields
 * (gradOutput, gradOutput * (input - mean)). */
struct Encoding_(GradOp) {
    __device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
        : mean(m), input(i), gradOutput(g) {}
    __device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
        real g = gradOutput[batch][plane][n].ldg();
        real c = input[batch][plane][n].ldg() - mean;
        return Encoding_(Float2)(g, g * c);
    }
    real mean;
    THCDeviceTensor<real, 3> input;
    THCDeviceTensor<real, 3> gradOutput;
};
// Sum across (batch, x/y/z) applying Op() pointwise.
// Block-wide reduction: each thread accumulates a strided slice, warps
// reduce internally, warp leaders stage partials into shared memory,
// and the first warp reduces the staged values.  Every thread returns
// the final total read from shared[0].
// NOTE(review): the zeroing step assumes blockDim.x is a multiple of
// WARP_SIZE -- confirm getNumThreads() guarantees this.
__device__ Encoding_(Float2) Encoding_(reduce)(Encoding_(GradOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
    Encoding_(Float2) sum = (Encoding_(Float2))0;
    for (int batch = 0; batch < tensor.getSize(0); ++batch) {
        for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
            sum += op(batch, plane, x);
        }
    }
    // sum over NumThreads within a warp
    sum = Encoding_(warpSum)(sum);
    // 'transpose', and reduce within warp again
    __shared__ Encoding_(Float2) shared[32];
    __syncthreads();
    if (threadIdx.x % WARP_SIZE == 0) {
        if (threadIdx.x / WARP_SIZE < 32) {
            shared[threadIdx.x / WARP_SIZE] = sum;
        }
    }
    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
        // zero out the other entries in shared
        shared[threadIdx.x] = (Encoding_(Float2))0;
    }
    __syncthreads();
    if (threadIdx.x / WARP_SIZE == 0) {
        sum = Encoding_(warpSum)(shared[threadIdx.x]);
        if (threadIdx.x == 0) {
            shared[0] = sum;
        }
    }
    __syncthreads();
    // Everyone picks it up, should be broadcast into the whole gradInput
    return shared[0];
}
/* Batch norm backward kernel; one block per channel c.
 * First reduces (sum gradOutput, sum gradOutput*(x-mean)) over
 * (batch, last dim), then writes:
 *  - gradMean / gradStd (train mode only, thread 0),
 *  - gradInput = gradOutput * gamma * invstd (straight-through term;
 *    the mean/std contributions flow through gradMean/gradStd),
 *  - gradgamma / gradbeta, ACCUMULATED with += by thread 0. */
__global__ void Encoding_(BatchNorm_Backward_kernel) (
    THCDeviceTensor<real, 3> gradoutput,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 3> gradinput,
    THCDeviceTensor<real, 1> gradgamma,
    THCDeviceTensor<real, 1> gradbeta,
    THCDeviceTensor<real, 1> mean,
    THCDeviceTensor<real, 1> invstd,
    THCDeviceTensor<real, 1> gamma,
    THCDeviceTensor<real, 1> beta,
    THCDeviceTensor<real, 1> gradMean,
    THCDeviceTensor<real, 1> gradStd,
    int train)
{
    /* Get the index and channels */
    int c = blockIdx.x;
    /* block-wide reduction of gradOutput and its dot with (x - mean) */
    Encoding_(GradOp) g(mean[c], input, gradoutput);
    Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
    real gradOutputSum = res.v1;
    real dotP = res.v2;
    //real projScale = dotP * norm * invstd[c].ldg() * invstd[c].ldg();
    real gradScale = invstd[c].ldg() * gamma[c].ldg();
    if (train && threadIdx.x == 0) {
        gradMean[c] = - gradOutputSum * gamma[c].ldg() * invstd[c].ldg();
        /* note the extra invstd factor relative to gradMean */
        gradStd[c] = - dotP * gamma[c].ldg() * invstd[c].ldg() * invstd[c].ldg();
    }
    if (gradinput.numElements() > 0) {
        for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
            for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
                gradinput[batch][c][x] = gradoutput[batch][c][x].ldg() * gradScale;
            }
        }
    }
    if (gradgamma.numElements() > 0) {
        if (threadIdx.x == 0) {
            gradgamma[c] += dotP * invstd[c].ldg();
        }
    }
    if (gradbeta.numElements() > 0) {
        if (threadIdx.x == 0) {
            gradbeta[c] += gradOutputSum;
        }
    }
}
void Encoding_(BatchNorm_Backward)(THCState *state,
    THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
    THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
    THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
    THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
 * Batch norm backward host launcher.
 * Assumes the input has already been flattened to (N, C, L).
 */
{
    /* Check the GPU index and tensor dims.
     * BUGFIX: the original passed a count of 6 to the variadic checkGPU
     * while listing 9 tensors, and never checked gradMean_/gradStd_;
     * all 11 tensors are checked now. */
    THCTensor_(checkGPU)(state, 11, gradoutput_, input_, gradinput_,
        gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_,
        gradMean_, gradStd_);
    if (THCTensor_(nDimension)(state, gradoutput_) != 3 ||
        THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, gradinput_) != 3 ||
        THCTensor_(nDimension)(state, gradgamma_) != 1 ||
        THCTensor_(nDimension)(state, gradbeta_) != 1 ||
        THCTensor_(nDimension)(state, mean_) != 1 ||
        THCTensor_(nDimension)(state, invstd_) != 1 ||
        THCTensor_(nDimension)(state, gamma_) != 1 ||
        THCTensor_(nDimension)(state, beta_) != 1 ||
        THCTensor_(nDimension)(state, gradMean_) != 1 ||
        THCTensor_(nDimension)(state, gradStd_) != 1 )
        THError("BatchNorm2d backward: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> gradoutput =
        devicetensor<3>(state, gradoutput_);
    THCDeviceTensor<real, 3> input =
        devicetensor<3>(state, input_);
    THCDeviceTensor<real, 3> gradinput =
        devicetensor<3>(state, gradinput_);
    THCDeviceTensor<real, 1> gradgamma =
        devicetensor<1>(state, gradgamma_);
    THCDeviceTensor<real, 1> gradbeta = devicetensor<1>(state, gradbeta_);
    THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
    THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
    THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
    THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
    THCDeviceTensor<real, 1> gradMean = devicetensor<1>(state, gradMean_);
    THCDeviceTensor<real, 1> gradStd = devicetensor<1>(state, gradStd_);
    /* one block per channel, threads strided over the last dim */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(BatchNorm_Backward_kernel)<<<blocks, threads, 0, stream>>>(
        gradoutput, input, gradinput, gradgamma, gradbeta, mean, invstd,
        gamma, beta, gradMean, gradStd, train);
    THCudaCheck(cudaGetLastError());
}
/* Reduction payload for the per-channel statistics pass: per element,
 * yields (x, x^2) so one reduction produces both sum and sum of squares.
 * FIX: dropped the unused `real mean;` member the original carried over
 * from GradOp -- it was never initialized or read. */
struct Encoding_(SumOp) {
    __device__ Encoding_(SumOp)(THCDeviceTensor<real, 3> i)
        : input(i){}
    __device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
        real g = input[batch][plane][n].ldg();
        return Encoding_(Float2)(g, g * g);
    }
    THCDeviceTensor<real, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise.
// Same block-wide reduction scheme as Encoding_(reduce) above, but
// taking a SumOp payload: strided per-thread accumulation, warp sums,
// shared-memory staging, final first-warp sum broadcast via shared[0].
__device__ Encoding_(Float2) Encoding_(reduce_sum)(Encoding_(SumOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
    Encoding_(Float2) sum = (Encoding_(Float2))0;
    for (int batch = 0; batch < tensor.getSize(0); ++batch) {
        for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
            sum += op(batch, plane, x);
        }
    }
    // sum over NumThreads within a warp
    sum = Encoding_(warpSum)(sum);
    // 'transpose', and reduce within warp again
    __shared__ Encoding_(Float2) shared[32];
    __syncthreads();
    if (threadIdx.x % WARP_SIZE == 0) {
        if (threadIdx.x / WARP_SIZE < 32) {
            shared[threadIdx.x / WARP_SIZE] = sum;
        }
    }
    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
        // zero out the other entries in shared
        shared[threadIdx.x] = (Encoding_(Float2))0;
    }
    __syncthreads();
    if (threadIdx.x / WARP_SIZE == 0) {
        sum = Encoding_(warpSum)(shared[threadIdx.x]);
        if (threadIdx.x == 0) {
            shared[0] = sum;
        }
    }
    __syncthreads();
    // Everyone picks it up, should be broadcast into the whole gradInput
    return shared[0];
}
/* Per-channel sum and sum of squares in a single pass (the host side
 * derives mean/variance from these).  One block per channel. */
__global__ void Encoding_(Sum_Square_Forward_kernel) (
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> sum,
    THCDeviceTensor<real, 1> square)
{
    int c = blockIdx.x;
    /* block-wide reduction of (x, x^2) */
    Encoding_(SumOp) g(input);
    Encoding_(Float2) res = Encoding_(reduce_sum)(g, input, c);
    real xsum = res.v1;
    real xsquare = res.v2;
    if (threadIdx.x == 0) {
        sum[c] = xsum;
        square[c] = xsquare;
    }
}
void Encoding_(Sum_Square_Forward)(THCState *state,
    THCTensor *input_, THCTensor *sum_, THCTensor *square_)
/*
 * Host launcher: sum_[c] = sum over (N, L) of x, square_[c] = sum x^2.
 */
{
    /* Check the GPU index and tensor dims*/
    THCTensor_(checkGPU)(state, 3, input_, sum_, square_);
    if (THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, sum_) != 1 ||
        THCTensor_(nDimension)(state, square_) != 1)
        THError("Sum_Square forward: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
    THCDeviceTensor<real, 1> sum = devicetensor<1>(state, sum_);
    THCDeviceTensor<real, 1> square = devicetensor<1>(state, square_);
    /* one block per channel, threads strided over the last dim */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(Sum_Square_Forward_kernel)<<<blocks, threads, 0, stream>>>(
        input, sum, square);
    THCudaCheck(cudaGetLastError());
}
/* Backward of Sum_Square: d(sum x)/dx = 1 and d(sum x^2)/dx = 2x, so
 * gradInput = gradSum[c] + 2 * gradSquare[c] * x per channel. */
__global__ void Encoding_(Sum_Square_Backward_kernel) (
    THCDeviceTensor<real, 3> gradInput,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> gradSum,
    THCDeviceTensor<real, 1> gradSquare)
{
    int c = blockIdx.x;
    /* main operation */
    for (int batch = 0; batch < gradInput.getSize(0); ++batch) {
        for (int x = threadIdx.x; x < gradInput.getSize(2); x += blockDim.x)
        {
            gradInput[batch][c][x] = gradSum[c] + 2 * gradSquare[c] *
                input[batch][c][x];
        }
    }
}
void Encoding_(Sum_Square_Backward)(THCState *state,
    THCTensor *gradInput_, THCTensor *input_,
    THCTensor *gradSum_, THCTensor *gradSquare_)
/*
 * Host launcher for the Sum_Square backward pass.
 */
{
    /* Check the GPU index and tensor dims*/
    THCTensor_(checkGPU)(state, 4, gradInput_, input_, gradSum_,
        gradSquare_);
    if (THCTensor_(nDimension)(state, gradInput_) != 3 ||
        THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, gradSum_) != 1 ||
        THCTensor_(nDimension)(state, gradSquare_) != 1)
        /* BUGFIX: error message said "forward" in the backward path */
        THError("Sum_Square backward: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> gradInput = devicetensor<3>(state, gradInput_);
    THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
    THCDeviceTensor<real, 1> gradSum = devicetensor<1>(state, gradSum_);
    THCDeviceTensor<real, 1> gradSquare =devicetensor<1>(state, gradSquare_);
    /* one block per channel, threads strided over the last dim */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(Sum_Square_Backward_kernel)<<<blocks, threads, 0, stream>>>(
        gradInput, input, gradSum, gradSquare);
    THCudaCheck(cudaGetLastError());
}
#endif
......@@ -38,28 +38,4 @@ void Encoding_(Residual_Forward)(
/* Backward of the Residual op (forward declared above this chunk). */
void Encoding_(Residual_Backward)(
    THCState *state, THCTensor *GR_, THCTensor *GX_, THCTensor *GD_);
/* Squared L2 norm over the last dim of R, and its backward. */
void Encoding_(SquareSqueeze_Forward)(
    THCState *state, THCTensor *L_, THCTensor *R_);
void Encoding_(SquareSqueeze_Backward)(
    THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_);
/* Custom batch norm over a flattened (N, C, L) input. */
void Encoding_(BatchNorm_Forward)(THCState *state,
    THCTensor *output_, THCTensor *input_,
    THCTensor *mean_, THCTensor *invstd_,
    THCTensor *gamma_, THCTensor *beta_);
void Encoding_(BatchNorm_Backward)(THCState *state,
    THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
    THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
    THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
    THCTensor *gradMean_, THCTensor *gradStd_, int train);
/* Per-channel sum and sum of squares (batch statistics), and backward.
 * NOTE(review): first param is spelled gradInput (no trailing
 * underscore) unlike the definition's gradInput_ -- harmless in a
 * declaration, but inconsistent. */
void Encoding_(Sum_Square_Forward)(THCState *state,
    THCTensor *input_, THCTensor *sum_, THCTensor *square_);
void Encoding_(Sum_Square_Backward)(THCState *state,
    THCTensor *gradInput, THCTensor *input_,
    THCTensor *gradSum_, THCTensor *gradSquare_);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.c"
#else
__global__ void Encoding_(DilatedAvgPool_Forward_kernel) (
    THCDeviceTensor<real, 4> X,
    THCDeviceTensor<real, 4> Y,
    int kH, int kW, int dH, int dW,
    int padH, int padW, int dilationH, int dilationW
    )
/*
 * Dilated 2d average-pool forward kernel.  Each thread computes one
 * output pixel Y[b][c][h][w] as the mean of the dilated, boundary-
 * clipped kH x kW window of X; pool_size counts only in-bounds taps.
 */
{
    /* declarations of the variables */
    int bc, b, c, w, h, C;
    real sum;
    /* Get the index and channels (z-block encodes batch*channel) */
    bc = blockIdx.z;
    w = blockIdx.x * blockDim.x + threadIdx.x;
    h = blockIdx.y * blockDim.y + threadIdx.y;
    C = Y.getSize(1);
    b = bc / C;
    c = bc - b*C;
    /* boundary check for output */
    if (w >= Y.getSize(3) || h >= Y.getSize(2)) return;
    /* BUGFIX: the row offset must use the row stride dH; the original
     * used dW, which is wrong whenever dH != dW. */
    int hstart = h*dH -padH;
    int wstart = w*dW -padW;
    int hend = min(hstart + kH*dilationH, X.getSize(2));
    int wend = min(wstart + kW*dilationW, X.getSize(3));
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    /* number of taps actually inside the input */
    int pool_size = ((hend - hstart - 1) / dilationH + 1) *
        ((wend - wstart - 1) / dilationW + 1);
    sum = 0;
    for (int th=hstart; th < hend; th+=dilationH) {
        for (int tw=wstart; tw < wend; tw+=dilationW) {
            sum += X[b][c][th][tw];
        }
    }
    Y[b][c][h][w] = sum / pool_size;
}
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
    THCTensor *X_, THCTensor *Y_,
    int kH, int kW, int dH, int dW,
    int padH, int padW,
    int dilationH, int dilationW)
/*
 * Host launcher for the dilated average-pool forward kernel.
 */
{
    /* validate device placement and tensor ranks */
    THCTensor_(checkGPU)(state, 2, X_, Y_);
    if (THCTensor_(nDimension)(state, X_) != 4 ||
        THCTensor_(nDimension)(state, Y_) != 4)
        THError("Encoding: incorrect input dims. \n");
    THCDeviceTensor<real, 4> X = devicetensor<4>(state, X_);
    THCDeviceTensor<real, 4> Y = devicetensor<4>(state, Y_);
    /* 16x16 tiles over (w, h); one z-block per (batch, channel) pair */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(Y.getSize(3)/16+1, Y.getSize(2)/16+1,
        Y.getSize(1)*Y.getSize(0));
    Encoding_(DilatedAvgPool_Forward_kernel)<<<blocks, threads, 0, stream>>>
        (X, Y, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
    THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(DilatedAvgPool_Backward_kernel) (
    THCDeviceTensor<real, 4> gradX,
    THCDeviceTensor<real, 4> gradY,
    int kH, int kW, int dH, int dW,
    int padH, int padW, int dilationH, int dilationW
    )
/*
 * Dilated 2d average-pool backward kernel.  Each thread computes one
 * input gradient gradX[b][c][h][w] by summing, over every output
 * window that covers (h, w), the corresponding gradY value divided by
 * that window's pool_size (mirroring the forward normalization).
 */
{
    /* declarations of the variables */
    int bc, b, c, w, h, C;
    real sum;
    /* Get the index and channels (z-block encodes batch*channel) */
    bc = blockIdx.z;
    w = blockIdx.x * blockDim.x + threadIdx.x;
    h = blockIdx.y * blockDim.y + threadIdx.y;
    C = gradX.getSize(1);
    b = bc / C;
    c = bc - b*C;
    /* boundary check for output */
    if (w >= gradX.getSize(3) || h >= gradX.getSize(2)) return;
    /* range of output rows/cols whose windows can cover (h, w) */
    int phstart = (h + padH < ((kH-1)*dilationH+1)) ? 0 :
        (h + padH - ((kH-1)*dilationH+1))/dH + 1;
    int pwstart = (w + padW < ((kW-1)*dilationW+1)) ? 0 :
        (w + padW - ((kW-1)*dilationW+1))/dW + 1;
    int phend = min((h+padH)/dH+1, gradY.getSize(2));
    int pwend = min((w+padW)/dW+1, gradY.getSize(3));
    sum = 0;
    int hstart, wstart, hend, wend, pool_size;
    for (int ph=phstart; ph < phend; ++ph) {
        for (int pw=pwstart; pw < pwend; ++pw) {
            /* BUGFIX: the row offset must use the row stride dH (the
             * original used dW), matching the forward kernel. */
            hstart = ph*dH -padH;
            wstart = pw*dW -padW;
            hend = min(hstart + kH*dilationH, gradX.getSize(2));
            wend = min(wstart + kW*dilationW, gradX.getSize(3));
            hstart = max(hstart, 0);
            wstart = max(wstart, 0);
            pool_size = ((hend - hstart - 1) / dilationH + 1) *
                ((wend - wstart - 1) / dilationW + 1);
            sum += gradY[b][c][ph][pw] / pool_size;
        }
    }
    gradX[b][c][h][w] = sum;
}
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
    THCTensor *gradX_, THCTensor *gradY_,
    int kH, int kW, int dH, int dW,
    int padH, int padW,
    int dilationH, int dilationW)
/*
 * Host launcher for the dilated average-pool backward kernel.
 */
{
    /* validate device placement and tensor ranks */
    THCTensor_(checkGPU)(state, 2, gradX_, gradY_);
    if (THCTensor_(nDimension)(state, gradX_) != 4 ||
        THCTensor_(nDimension)(state, gradY_) != 4)
        THError("Encoding: incorrect input dims. \n");
    THCDeviceTensor<real, 4> gradX = devicetensor<4>(state, gradX_);
    THCDeviceTensor<real, 4> gradY = devicetensor<4>(state, gradY_);
    /* 16x16 tiles over (w, h); one z-block per (batch, channel) pair */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(gradX.getSize(3)/16+1, gradX.getSize(2)/16+1,
        gradX.getSize(1)*gradX.getSize(0));
    Encoding_(DilatedAvgPool_Backward_kernel)<<<blocks, threads, 0, stream>>>
        (gradX, gradY, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
    THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.h"
#else
/* Dilated 2d average pooling: Y = avgpool(X) with kernel (kH, kW),
 * stride (dH, dW), padding (padH, padW), dilation (dilationH,
 * dilationW).  Definitions in generic/pooling_kernel.c. */
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
    THCTensor *X_, THCTensor *Y_,
    int kH, int kW, int dH, int dW,
    int padH, int padW,
    int dilationH, int dilationW);
/* Backward pass: gradX from gradY with the same pooling geometry. */
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
    THCTensor *gradX_, THCTensor *gradY_,
    int kH, int kW, int dH, int dW,
    int padH, int padW,
    int dilationH, int dilationW);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.c"
#else
/* Forward kernel: L[b][i][k] = sum_d R[b][i][k][d]^2 -- squared L2 norm
 * over the last dimension of R.  One thread per output element. */
__global__ void Encoding_(SquareSqueeze_Forward_kernel) (
    THCDeviceTensor<real, 3> L,
    THCDeviceTensor<real, 4> R)
{
    /* declarations of the variables */
    int b, k, d, i, D;
    real sum;
    /* Get the index and channels */
    b = blockIdx.z;
    k = blockIdx.x * blockDim.x + threadIdx.x;
    i = blockIdx.y * blockDim.y + threadIdx.y;
    D = R.getSize(3);
    /* boundary check for output */
    if (k >= L.getSize(2) || i >= L.getSize(1)) return;
    /* main operation */
    sum = 0;
    for(d=0; d<D; d++) {
        sum += R[b][i][k][d].ldg()*R[b][i][k][d].ldg();
    }
    L[b][i][k] = sum;
}
/* Host launcher for the sum-of-squares reduction above. */
void Encoding_(SquareSqueeze_Forward)(
    THCState *state, THCTensor *L_, THCTensor *R_)
{
    /* Check the GPU index and tensor dims*/
    THCTensor_(checkGPU)(state, 2, L_, R_);
    if (THCTensor_(nDimension)(state, L_) != 3 ||
        THCTensor_(nDimension)(state, R_) != 4)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> L = devicetensor<3>(state, L_);
    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
    /* 16x16 tiles over (k, i); one z-block per batch element */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(L.getSize(2)/16+1, L.getSize(1)/16+1,
        L.getSize(0));
    Encoding_(SquareSqueeze_Forward_kernel)<<<blocks, threads, 0, stream>>>
        (L, R);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/* Backward kernel: d(sum_d R^2)/dR = 2*R scaled by the incoming
 * gradient, so GR[b][i][k][d] = 2 * GL[b][i][k] * R[b][i][k][d]. */
__global__ void Encoding_(SquareSqueeze_Backward_kernel) (
    THCDeviceTensor<real, 3> GL,
    THCDeviceTensor<real, 4> GR,
    THCDeviceTensor<real, 4> R)
{
    /* declarations of the variables */
    int b, k, d, i, D;
    real scale;
    /* Get the index and channels */
    b = blockIdx.z;
    k = blockIdx.x * blockDim.x + threadIdx.x;
    i = blockIdx.y * blockDim.y + threadIdx.y;
    D = R.getSize(3);
    /* boundary check for output */
    if (k >= R.getSize(2) || i >= R.getSize(1)) return;
    /* main operation */
    scale = GL[b][i][k] * 2;
    for(d=0; d<D; d++) {
        GR[b][i][k][d] = scale * R[b][i][k][d];
    }
}
/* Host launcher for the sum-of-squares backward kernel. */
void Encoding_(SquareSqueeze_Backward)(
    THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_)
{
    /* Check the GPU index and tensor dims*/
    THCTensor_(checkGPU)(state, 3, GL_, GR_, R_);
    if (THCTensor_(nDimension)(state, GL_) != 3 ||
        THCTensor_(nDimension)(state, GR_) != 4 ||
        THCTensor_(nDimension)(state, R_) != 4)
        THError("Encoding: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> GL = devicetensor<3>(state, GL_);
    THCDeviceTensor<real, 4> GR = devicetensor<4>(state, GR_);
    THCDeviceTensor<real, 4> R = devicetensor<4>(state, R_);
    /* 16x16 tiles over (k, i); one z-block per batch element */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 threads(16, 16);
    dim3 blocks(R.getSize(2)/16+1, R.getSize(1)/16+1,
        R.getSize(0));
    Encoding_(SquareSqueeze_Backward_kernel)<<<blocks, threads, 0, stream>>>
        (GL, GR, R);
    THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/* Normalize a flattened (N, C, L) input with per-channel statistics:
 * out = gamma * (x - mean) * invstd + beta.
 * One block per channel c; threads stride over the last dimension. */
__global__ void Encoding_(BatchNorm_Forward_kernel) (
    THCDeviceTensor<real, 3> output,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> mean,
    THCDeviceTensor<real, 1> invstd,
    THCDeviceTensor<real, 1> gamma,
    THCDeviceTensor<real, 1> beta)
{
    int c = blockIdx.x;
    /* main operation */
    for (int b = 0; b < input.getSize(0); ++b) {
        for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
            real inp = input[b][c][x].ldg();
            output[b][c][x] = gamma[c].ldg() * (inp - mean[c].ldg()) *
                invstd[c].ldg() + beta[c].ldg();
        }
    }
}
void Encoding_(BatchNorm_Forward)(THCState *state,
    THCTensor *output_, THCTensor *input_,
    THCTensor *mean_, THCTensor *invstd_,
    THCTensor *gamma_, THCTensor *beta_)
/*
 * Batch norm forward host launcher.
 * Assumes the input has already been flattened to (N, C, L).
 */
{
    /* Check the GPU index and tensor dims */
    THCTensor_(checkGPU)(state, 6, output_, input_, mean_, invstd_,
        gamma_, beta_);
    if (THCTensor_(nDimension)(state, output_) != 3 ||
        THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, mean_) != 1 ||
        THCTensor_(nDimension)(state, invstd_) != 1 ||
        THCTensor_(nDimension)(state, gamma_) != 1 ||
        THCTensor_(nDimension)(state, beta_) != 1)
        THError("BatchNorm2d forward: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> output = devicetensor<3>(state, output_);
    THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
    THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
    THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
    THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
    THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
    /* one block per channel, threads strided over the last dim */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(BatchNorm_Forward_kernel)<<<blocks, threads, 0, stream>>>(
        output, input, mean, invstd, gamma, beta);
    THCudaCheck(cudaGetLastError());
}
/* Two-component accumulator used by the warp/block reductions below. */
struct Encoding_(Float2){
    real v1, v2;
    __device__ Encoding_(Float2)() {}
    __device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
    __device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
    __device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}
    __device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a) {
        v1 += a.v1;
        v2 += a.v2;
        return *this;
    }
};
/* Sum a scalar across the lanes of one warp.  On sm_30+ this is the
 * __shfl_xor butterfly; otherwise a shared-memory fallback.
 * NOTE(review): __shfl_xor is the pre-CUDA-9 (non-_sync) intrinsic. */
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if __CUDA_ARCH__ >= 300
    for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
        val += __shfl_xor(val, 1 << i, WARP_SIZE);
    }
#else
    __shared__ real values[MAX_BLOCK_SIZE];
    values[threadIdx.x] = val;
    __threadfence_block();
    const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
    /* add the other WARP_SIZE-1 lanes; own value is already in val */
    for (int i = 1; i < WARP_SIZE; i++) {
        val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
    }
#endif
    return val;
}
/* Component-wise warp sum of a Float2. */
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(Encoding_(Float2) value) {
    value.v1 = Encoding_(rwarpSum)(value.v1);
    value.v2 = Encoding_(rwarpSum)(value.v2);
    return value;
}
/* Reduction payload for batch norm backward: per element, yields
 * (gradOutput, gradOutput * (input - mean)). */
struct Encoding_(GradOp) {
    __device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
        : mean(m), input(i), gradOutput(g) {}
    __device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
        real g = gradOutput[batch][plane][n].ldg();
        real c = input[batch][plane][n].ldg() - mean;
        return Encoding_(Float2)(g, g * c);
    }
    real mean;
    THCDeviceTensor<real, 3> input;
    THCDeviceTensor<real, 3> gradOutput;
};
// Sum across (batch, x/y/z) applying Op() pointwise.
// Block-wide reduction: each thread accumulates a strided slice, warps
// reduce internally, warp leaders stage partials into shared memory,
// and the first warp reduces the staged values.  Every thread returns
// the final total read from shared[0].
// NOTE(review): the zeroing step assumes blockDim.x is a multiple of
// WARP_SIZE -- confirm getNumThreads() guarantees this.
__device__ Encoding_(Float2) Encoding_(reduce)(Encoding_(GradOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
    Encoding_(Float2) sum = (Encoding_(Float2))0;
    for (int batch = 0; batch < tensor.getSize(0); ++batch) {
        for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
            sum += op(batch, plane, x);
        }
    }
    // sum over NumThreads within a warp
    sum = Encoding_(warpSum)(sum);
    // 'transpose', and reduce within warp again
    __shared__ Encoding_(Float2) shared[32];
    __syncthreads();
    if (threadIdx.x % WARP_SIZE == 0) {
        if (threadIdx.x / WARP_SIZE < 32) {
            shared[threadIdx.x / WARP_SIZE] = sum;
        }
    }
    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
        // zero out the other entries in shared
        shared[threadIdx.x] = (Encoding_(Float2))0;
    }
    __syncthreads();
    if (threadIdx.x / WARP_SIZE == 0) {
        sum = Encoding_(warpSum)(shared[threadIdx.x]);
        if (threadIdx.x == 0) {
            shared[0] = sum;
        }
    }
    __syncthreads();
    // Everyone picks it up, should be broadcast into the whole gradInput
    return shared[0];
}
/* Batch norm backward kernel; one block per channel c.
 * First reduces (sum gradOutput, sum gradOutput*(x-mean)) over
 * (batch, last dim), then writes:
 *  - gradMean / gradStd (train mode only, thread 0),
 *  - gradInput = gradOutput * gamma * invstd (straight-through term;
 *    the mean/std contributions flow through gradMean/gradStd),
 *  - gradgamma / gradbeta, ACCUMULATED with += by thread 0. */
__global__ void Encoding_(BatchNorm_Backward_kernel) (
    THCDeviceTensor<real, 3> gradoutput,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 3> gradinput,
    THCDeviceTensor<real, 1> gradgamma,
    THCDeviceTensor<real, 1> gradbeta,
    THCDeviceTensor<real, 1> mean,
    THCDeviceTensor<real, 1> invstd,
    THCDeviceTensor<real, 1> gamma,
    THCDeviceTensor<real, 1> beta,
    THCDeviceTensor<real, 1> gradMean,
    THCDeviceTensor<real, 1> gradStd,
    int train)
{
    /* Get the index and channels */
    int c = blockIdx.x;
    /* block-wide reduction of gradOutput and its dot with (x - mean) */
    Encoding_(GradOp) g(mean[c], input, gradoutput);
    Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
    real gradOutputSum = res.v1;
    real dotP = res.v2;
    //real projScale = dotP * norm * invstd[c].ldg() * invstd[c].ldg();
    real gradScale = invstd[c].ldg() * gamma[c].ldg();
    if (train && threadIdx.x == 0) {
        gradMean[c] = - gradOutputSum * gamma[c].ldg() * invstd[c].ldg();
        /* note the extra invstd factor relative to gradMean */
        gradStd[c] = - dotP * gamma[c].ldg() * invstd[c].ldg() * invstd[c].ldg();
    }
    if (gradinput.numElements() > 0) {
        for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
            for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
                gradinput[batch][c][x] = gradoutput[batch][c][x].ldg() * gradScale;
            }
        }
    }
    if (gradgamma.numElements() > 0) {
        if (threadIdx.x == 0) {
            gradgamma[c] += dotP * invstd[c].ldg();
        }
    }
    if (gradbeta.numElements() > 0) {
        if (threadIdx.x == 0) {
            gradbeta[c] += gradOutputSum;
        }
    }
}
void Encoding_(BatchNorm_Backward)(THCState *state,
    THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
    THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
    THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
    THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
 * Batch norm backward host entry point.
 * Assumes the input is already flattened to (B x C x N); launches one
 * thread block per channel on the current stream.
 */
{
    /* Check the GPU index and tensor dims.
     * FIX: the tensor count must match the number of tensors passed; it was
     * 6, which left invstd_/gamma_/beta_ (and the grad stats) unchecked. */
    THCTensor_(checkGPU)(state, 11, gradoutput_, input_, gradinput_,
        gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_,
        gradMean_, gradStd_);
    if (THCTensor_(nDimension)(state, gradoutput_) != 3 ||
        THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, gradinput_) != 3 ||
        THCTensor_(nDimension)(state, gradgamma_) != 1 ||
        THCTensor_(nDimension)(state, gradbeta_) != 1 ||
        THCTensor_(nDimension)(state, mean_) != 1 ||
        THCTensor_(nDimension)(state, invstd_) != 1 ||
        THCTensor_(nDimension)(state, gamma_) != 1 ||
        THCTensor_(nDimension)(state, beta_) != 1 ||
        THCTensor_(nDimension)(state, gradMean_) != 1 ||
        THCTensor_(nDimension)(state, gradStd_) != 1 )
        THError("BatchNorm2d backward: incorrect input dims. \n");
    /* Wrap the raw tensors for device-side indexing */
    THCDeviceTensor<real, 3> gradoutput =
        devicetensor<3>(state, gradoutput_);
    THCDeviceTensor<real, 3> input =
        devicetensor<3>(state, input_);
    THCDeviceTensor<real, 3> gradinput =
        devicetensor<3>(state, gradinput_);
    THCDeviceTensor<real, 1> gradgamma =
        devicetensor<1>(state, gradgamma_);
    THCDeviceTensor<real, 1> gradbeta = devicetensor<1>(state, gradbeta_);
    THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
    THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
    THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
    THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
    THCDeviceTensor<real, 1> gradMean = devicetensor<1>(state, gradMean_);
    THCDeviceTensor<real, 1> gradStd = devicetensor<1>(state, gradStd_);
    /* one block per channel; thread count sized to the innermost extent */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(BatchNorm_Backward_kernel)<<<blocks, threads, 0, stream>>>(
        gradoutput, input, gradinput, gradgamma, gradbeta, mean, invstd,
        gamma, beta, gradMean, gradStd, train);
    THCudaCheck(cudaGetLastError());
}
// Pointwise op for the sum / sum-of-squares reduction: for one element x it
// returns (x, x*x) so a single reduction pass accumulates both moments.
// FIX: removed the unused `real mean` member (copy-paste residue from
// GradOp) — it was never initialized or read.
struct Encoding_(SumOp) {
    __device__ Encoding_(SumOp)(THCDeviceTensor<real, 3> i)
        : input(i) {}
    // batch/plane/n index one element; .ldg() loads via the read-only cache.
    __device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
        real g = input[batch][plane][n].ldg();
        return Encoding_(Float2)(g, g * g);
    }
    THCDeviceTensor<real, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise.
// Same two-stage block reduction as Encoding_(reduce), specialized to the
// SumOp functor: per-thread strided accumulation, warp-level combine, a
// shared-memory staging pass, and a final reduction by the first warp.
__device__ Encoding_(Float2) Encoding_(reduce_sum)(Encoding_(SumOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
    Encoding_(Float2) sum = (Encoding_(Float2))0;
    // thread-strided accumulation over (batch, innermost dim)
    for (int batch = 0; batch < tensor.getSize(0); ++batch) {
        for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
            sum += op(batch, plane, x);
        }
    }
    // sum over NumThreads within a warp
    sum = Encoding_(warpSum)(sum);
    // 'transpose', and reduce within warp again
    __shared__ Encoding_(Float2) shared[32];
    __syncthreads();
    if (threadIdx.x % WARP_SIZE == 0) {
        // lane 0 of each warp publishes its warp's total
        if (threadIdx.x / WARP_SIZE < 32) {
            shared[threadIdx.x / WARP_SIZE] = sum;
        }
    }
    if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
        // zero out the other entries in shared
        shared[threadIdx.x] = (Encoding_(Float2))0;
    }
    __syncthreads();
    // first warp reduces the per-warp totals
    if (threadIdx.x / WARP_SIZE == 0) {
        sum = Encoding_(warpSum)(shared[threadIdx.x]);
        if (threadIdx.x == 0) {
            shared[0] = sum;
        }
    }
    __syncthreads();
    // Everyone picks it up, should be broadcast into the whole gradInput
    return shared[0];
}
// Per-channel sum and sum-of-squares. One thread block per channel
// (blockIdx.x); thread 0 writes the two reduced scalars.
__global__ void Encoding_(Sum_Square_Forward_kernel) (
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> sum,
    THCDeviceTensor<real, 1> square)
{
    const int plane = blockIdx.x;
    // Block-wide reduction of (x, x^2) pairs over batch and innermost dims.
    Encoding_(SumOp) op(input);
    Encoding_(Float2) acc = Encoding_(reduce_sum)(op, input, plane);
    // Single writer per channel.
    if (threadIdx.x == 0) {
        sum[plane] = acc.v1;
        square[plane] = acc.v2;
    }
}
void Encoding_(Sum_Square_Forward)(THCState *state,
    THCTensor *input_, THCTensor *sum_, THCTensor *square_)
/*
 * Host entry point: per-channel sum and sum-of-squares of a (B x C x N)
 * input, one thread block per channel on the current stream.
 */
{
    /* all tensors on the same GPU, with the expected ranks */
    THCTensor_(checkGPU)(state, 3, input_, sum_, square_);
    if (THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, sum_) != 1 ||
        THCTensor_(nDimension)(state, square_) != 1)
        THError("Sum_Square forward: incorrect input dims. \n");
    /* wrap the raw tensors for device-side indexing */
    THCDeviceTensor<real, 3> x = devicetensor<3>(state, input_);
    THCDeviceTensor<real, 1> xsum = devicetensor<1>(state, sum_);
    THCDeviceTensor<real, 1> xsquare = devicetensor<1>(state, square_);
    /* one block per channel; thread count sized to the innermost extent */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(x.getSize(1));
    dim3 threads(getNumThreads(x.getSize(2)));
    Encoding_(Sum_Square_Forward_kernel)<<<blocks, threads, 0, stream>>>(
        x, xsum, xsquare);
    THCudaCheck(cudaGetLastError());
}
// Backward of the sum / sum-of-squares reduction. One thread block per
// channel; d/dx [gradSum * sum(x) + gradSquare * sum(x^2)]
//            = gradSum + 2 * gradSquare * x.
__global__ void Encoding_(Sum_Square_Backward_kernel) (
    THCDeviceTensor<real, 3> gradInput,
    THCDeviceTensor<real, 3> input,
    THCDeviceTensor<real, 1> gradSum,
    THCDeviceTensor<real, 1> gradSquare)
{
    const int plane = blockIdx.x;
    const int nbatch = gradInput.getSize(0);
    const int nspatial = gradInput.getSize(2);
    for (int b = 0; b < nbatch; ++b) {
        for (int i = threadIdx.x; i < nspatial; i += blockDim.x) {
            gradInput[b][plane][i] =
                gradSum[plane] + 2 * gradSquare[plane] * input[b][plane][i];
        }
    }
}
void Encoding_(Sum_Square_Backward)(THCState *state,
    THCTensor *gradInput_, THCTensor *input_,
    THCTensor *gradSum_, THCTensor *gradSquare_)
/*
 * Backward of Sum_Square_Forward:
 *   gradInput = gradSum + 2 * gradSquare * input,
 * broadcast per channel over a (B x C x N) input.
 */
{
    /* Check the GPU index and tensor dims*/
    THCTensor_(checkGPU)(state, 4, gradInput_, input_, gradSum_,
        gradSquare_);
    if (THCTensor_(nDimension)(state, gradInput_) != 3 ||
        THCTensor_(nDimension)(state, input_) != 3 ||
        THCTensor_(nDimension)(state, gradSum_) != 1 ||
        THCTensor_(nDimension)(state, gradSquare_) != 1)
        /* FIX: message previously said "forward" in the backward path */
        THError("Sum_Square backward: incorrect input dims. \n");
    /* Device tensors */
    THCDeviceTensor<real, 3> gradInput = devicetensor<3>(state, gradInput_);
    THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
    THCDeviceTensor<real, 1> gradSum = devicetensor<1>(state, gradSum_);
    THCDeviceTensor<real, 1> gradSquare =devicetensor<1>(state, gradSquare_);
    /* kernel launch: one block per channel on the current stream */
    cudaStream_t stream = THCState_getCurrentStream(state);
    dim3 blocks(input.getSize(1));
    dim3 threads(getNumThreads(input.getSize(2)));
    Encoding_(Sum_Square_Backward_kernel)<<<blocks, threads, 0, stream>>>(
        gradInput, input, gradSum, gradSquare);
    THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.h"
#else
/* Host-side entry points for the synchronized batch-norm CUDA kernels.
 * Instantiated once per scalar type through the THC generic-file mechanism
 * (Encoding_/THCTensor_ expand per type). */
void Encoding_(SquareSqueeze_Forward)(
    THCState *state, THCTensor *L_, THCTensor *R_);
void Encoding_(SquareSqueeze_Backward)(
    THCState *state, THCTensor *GL_, THCTensor *GR_, THCTensor *R_);
/* Forward pass using precomputed per-channel mean/invstd and affine
 * gamma/beta parameters. */
void Encoding_(BatchNorm_Forward)(THCState *state,
    THCTensor *output_, THCTensor *input_,
    THCTensor *mean_, THCTensor *invstd_,
    THCTensor *gamma_, THCTensor *beta_);
/* Backward pass; also emits per-channel mean/std gradients when train != 0. */
void Encoding_(BatchNorm_Backward)(THCState *state,
    THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
    THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
    THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
    THCTensor *gradMean_, THCTensor *gradStd_, int train);
/* Per-channel sum and sum-of-squares over a (B x C x N) input,
 * and the corresponding backward. */
void Encoding_(Sum_Square_Forward)(THCState *state,
    THCTensor *input_, THCTensor *sum_, THCTensor *square_);
void Encoding_(Sum_Square_Backward)(THCState *state,
    THCTensor *gradInput, THCTensor *input_,
    THCTensor *gradSum_, THCTensor *gradSquare_);
#endif
......@@ -21,12 +21,26 @@
extern "C" {
#endif
// float
#include "generic/encoding_kernel.c"
#include "THC/THCGenerateFloatType.h"
#include "generic/syncbn_kernel.c"
#include "THC/THCGenerateFloatType.h"
#include "generic/pooling_kernel.c"
#include "THC/THCGenerateFloatType.h"
// double
#include "generic/encoding_kernel.c"
#include "THC/THCGenerateDoubleType.h"
#include "generic/syncbn_kernel.c"
#include "THC/THCGenerateDoubleType.h"
#include "generic/pooling_kernel.c"
#include "THC/THCGenerateDoubleType.h"
#ifdef __cplusplus
}
#endif
......@@ -23,12 +23,26 @@ extern THCState *state;
extern "C" {
#endif
// float
#include "generic/encoding_kernel.h"
#include "THC/THCGenerateFloatType.h"
#include "generic/syncbn_kernel.h"
#include "THC/THCGenerateFloatType.h"
#include "generic/pooling_kernel.h"
#include "THC/THCGenerateFloatType.h"
// double
#include "generic/encoding_kernel.h"
#include "THC/THCGenerateDoubleType.h"
#include "generic/syncbn_kernel.h"
#include "THC/THCGenerateDoubleType.h"
#include "generic/pooling_kernel.h"
#include "THC/THCGenerateDoubleType.h"
#ifdef __cplusplus
}
#endif
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
from .syncbn import *
from .basic import *
from .encoding import *
from .customize import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import math
import torch
from torch.autograd import Variable
from torch.nn import Module, Sequential
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from torch.nn.modules.utils import _single, _pair, _triple
from ..parallel import my_data_parallel
from ..functions import view_each
__all__ = ['Module', 'Sequential', 'Conv1d', 'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d', 'AvgPool2d', 'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']
class _ConvNd(Module):
def __init__(self, in_channels, out_channels, kernel_size, stride,
padding, dilation, transposed, output_padding, groups, bias):
super(_ConvNd, self).__init__()
if in_channels % groups != 0:
raise ValueError('in_channels must be divisible by groups')
if out_channels % groups != 0:
raise ValueError('out_channels must be divisible by groups')
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.dilation = dilation
self.transposed = transposed
self.output_padding = output_padding
self.groups = groups
if transposed:
self.weight = Parameter(torch.Tensor(
in_channels, out_channels // groups, *kernel_size))
else:
self.weight = Parameter(torch.Tensor(
out_channels, in_channels // groups, *kernel_size))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n)
self.weight.data.uniform_(-stdv, stdv)
if self.bias is not None:
self.bias.data.uniform_(-stdv, stdv)
def __repr__(self):
s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}'
', stride={stride}')
if self.padding != (0,) * len(self.padding):
s += ', padding={padding}'
if self.dilation != (1,) * len(self.dilation):
s += ', dilation={dilation}'
if self.output_padding != (0,) * len(self.output_padding):
s += ', output_padding={output_padding}'
if self.groups != 1:
s += ', groups={groups}'
if self.bias is None:
s += ', bias=False'
s += ')'
return s.format(name=self.__class__.__name__, **self.__dict__)
class Conv1d(_ConvNd):
    r"""Applies a 1D convolution over an input signal composed of several
    input planes.

    For input :math:`(N, C_{in}, L)` the output :math:`(N, C_{out}, L_{out})`
    is

    .. math::
        out(N_i, C_{out_j}) = bias(C_{out_j})
            + \sum_{k=0}^{C_{in}-1} weight(C_{out_j}, k) \star input(N_i, k)

    where :math:`\star` is the valid `cross-correlation`_ operator.

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If True, adds a learnable bias. Default: True

    Shape:
        - Input: :math:`(N, C_{in}, L_{in})`
        - Output: :math:`(N, C_{out}, L_{out})` where
          :math:`L_{out} = floor((L_{in} + 2 * padding - dilation * (kernel\_size - 1) - 1) / stride + 1)`

    Examples::

        >>> m = nn.Conv1d(16, 33, 3, stride=2)
        >>> output = m(autograd.Variable(torch.randn(20, 16, 50)))

    .. _cross-correlation:
        https://en.wikipedia.org/wiki/Cross-correlation
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True):
        # Normalize every size argument to a 1-tuple before handing off to
        # the shared base class.
        super(Conv1d, self).__init__(
            in_channels, out_channels, _single(kernel_size), _single(stride),
            _single(padding), _single(dilation), False, _single(0),
            groups, bias)

    def forward(self, input):
        return F.conv1d(input, self.weight, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)
class Conv2d(_ConvNd):
    r"""Applies a 2D convolution over an input signal composed of several
    input planes.

    For input :math:`(N, C_{in}, H, W)` the output
    :math:`(N, C_{out}, H_{out}, W_{out})` is

    .. math::
        out(N_i, C_{out_j}) = bias(C_{out_j})
            + \sum_{k=0}^{C_{in}-1} weight(C_{out_j}, k) \star input(N_i, k)

    where :math:`\star` is the valid 2D `cross-correlation`_ operator.
    ``kernel_size``, ``stride``, ``padding`` and ``dilation`` may each be a
    single ``int`` (used for both spatial dimensions) or a 2-tuple of ints
    (height, width).

    Unlike the stock module, ``forward`` also accepts a tuple/list of
    inputs, which is dispatched through ``my_data_parallel``.

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If True, adds a learnable bias. Default: True

    Shape:
        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
          :math:`H_{out} = floor((H_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)`
          :math:`W_{out} = floor((W_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)`

    Examples::

        >>> m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
        >>> output = m(autograd.Variable(torch.randn(20, 16, 50, 100)))

    .. _cross-correlation:
        https://en.wikipedia.org/wiki/Cross-correlation
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True):
        # Normalize every size argument to a 2-tuple before handing off to
        # the shared base class.
        super(Conv2d, self).__init__(
            in_channels, out_channels, _pair(kernel_size), _pair(stride),
            _pair(padding), _pair(dilation), False, _pair(0), groups, bias)

    def forward(self, input):
        if isinstance(input, Variable):
            return F.conv2d(input, self.weight, self.bias, self.stride,
                            self.padding, self.dilation, self.groups)
        if isinstance(input, (tuple, list)):
            # multi-device inputs are fanned out across replicas
            return my_data_parallel(self, input)
        raise RuntimeError('unknown input type')
class _ConvTransposeMixin(object):
def forward(self, input, output_size=None):
output_padding = self._output_padding(input, output_size)
func = self._backend.ConvNd(
self.stride, self.padding, self.dilation, self.transposed,
output_padding, self.groups)
if self.bias is None:
return func(input, self.weight)
else:
return func(input, self.weight, self.bias)
def _output_padding(self, input, output_size):
if output_size is None:
return self.output_padding
output_size = list(output_size)
k = input.dim() - 2
if len(output_size) == k + 2:
output_size = output_size[-2:]
if len(output_size) != k:
raise ValueError(
"output_size must have {} or {} elements (got {})"
.format(k, k + 2, len(output_size)))
def dim_size(d):
return ((input.size(d + 2) - 1) * self.stride[d] -
2 * self.padding[d] + self.kernel_size[d])
min_sizes = [dim_size(d) for d in range(k)]
max_sizes = [min_sizes[d] + self.stride[d] - 1 for d in range(k)]
for size, min_size, max_size in zip(output_size, min_sizes, max_sizes):
if size < min_size or size > max_size:
raise ValueError((
"requested an output size of {}, but valid sizes range "
"from {} to {} (for an input of {})").format(
output_size, min_sizes, max_sizes, input.size()[2:]))
return tuple([output_size[d] - min_sizes[d] for d in range(k)])
class ConvTranspose2d(_ConvTransposeMixin, _ConvNd):
    r"""Applies a 2D transposed convolution operator over an input image
    composed of several input planes.

    This module can be seen as the gradient of :class:`Conv2d` with respect
    to its input; it is also known as a fractionally-strided convolution or
    (inaccurately) a deconvolution. ``kernel_size``, ``stride``, ``padding``
    and ``output_padding`` may each be a single ``int`` or a 2-tuple of
    ints (height, width). The exact output size may also be requested per
    call via the ``output_size`` argument of ``forward``.

    Unlike the stock module, ``forward`` also accepts a tuple/list of
    inputs, which is dispatched through ``my_data_parallel``.

    Args:
        in_channels (int): Number of channels in the input image
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int or tuple, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of
            the input. Default: 0
        output_padding (int or tuple, optional): Zero-padding added to one
            side of the output. Default: 0
        groups (int, optional): Number of blocked connections from input
            channels to output channels. Default: 1
        bias (bool, optional): If True, adds a learnable bias. Default: True
        dilation (int or tuple, optional): Spacing between kernel elements.
            Default: 1

    Shape:
        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
        - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
          :math:`H_{out} = (H_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + output\_padding[0]`
          :math:`W_{out} = (W_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + output\_padding[1]`

    Examples::

        >>> m = nn.ConvTranspose2d(16, 33, 3, stride=2)
        >>> output = m(autograd.Variable(torch.randn(20, 16, 50, 100)))
        >>> # exact output size can be also specified as an argument
        >>> upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
        >>> output = upsample(h, output_size=input.size())
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, output_padding=0, groups=1, bias=True,
                 dilation=1):
        # Normalize every size argument to a 2-tuple before handing off to
        # the shared base class.
        super(ConvTranspose2d, self).__init__(
            in_channels, out_channels, _pair(kernel_size), _pair(stride),
            _pair(padding), _pair(dilation), True, _pair(output_padding),
            groups, bias)

    def forward(self, input, output_size=None):
        # Resolve the requested output size into explicit output padding.
        output_padding = self._output_padding(input, output_size)
        if isinstance(input, Variable):
            return F.conv_transpose2d(
                input, self.weight, self.bias, self.stride, self.padding,
                output_padding, self.groups, self.dilation)
        if isinstance(input, (tuple, list)):
            return my_data_parallel(self, input)
        raise RuntimeError('unknown input type')
class Threshold(Module):
def __init__(self, threshold, value, inplace=False):
super(Threshold, self).__init__()
self.threshold = threshold
self.value = value
self.inplace = inplace
def forward(self, input):
if isinstance(input, Variable):
return F.threshold(input, self.threshold, self.value,
self.inplace)
elif isinstance(input, tuple) or isinstance(input, list):
return my_data_parallel(self, input)
else:
raise RuntimeError('unknown input type')
def __repr__(self):
inplace_str = ', inplace' if self.inplace else ''
return self.__class__.__name__ + ' (' \
+ str(self.threshold) \
+ ', ' + str(self.value) \
+ inplace_str + ')'
class ReLU(Threshold):
    """Applies the rectified linear unit function element-wise:
    :math:`{ReLU}(x) = max(0, x)`.

    Args:
        inplace: can optionally do the operation in-place. Default: False

    Shape:
        - Input: :math:`(N, *)` where `*` means any number of additional
          dimensions
        - Output: :math:`(N, *)`, same shape as the input

    Examples::

        >>> m = nn.ReLU()
        >>> output = m(autograd.Variable(torch.randn(2)))
    """

    def __init__(self, inplace=False):
        # ReLU is thresholding at zero with a replacement value of zero.
        super(ReLU, self).__init__(0, 0, inplace)

    def __repr__(self):
        return '{} ({})'.format(self.__class__.__name__,
                                'inplace' if self.inplace else '')
class Sigmoid(Module):
    """Applies the element-wise function :math:`f(x) = 1 / (1 + exp(-x))`.

    Shape:
        - Input: :math:`(N, *)` where `*` means any number of additional
          dimensions
        - Output: :math:`(N, *)`, same shape as the input

    Examples::

        >>> m = nn.Sigmoid()
        >>> output = m(autograd.Variable(torch.randn(2)))
    """

    def forward(self, input):
        if isinstance(input, Variable):
            return torch.sigmoid(input)
        if isinstance(input, (tuple, list)):
            # multi-device inputs are fanned out across replicas
            return my_data_parallel(self, input)
        raise RuntimeError('unknown input type')

    def __repr__(self):
        return self.__class__.__name__ + ' ()'
class MaxPool2d(Module):
    r"""Applies a 2D max pooling over an input signal composed of several
    input planes.

    For input :math:`(N, C, H, W)` and :attr:`kernel_size` :math:`(kH, kW)`:

    .. math::
        out(N_i, C_j, h, w) = \max_{m=0}^{kH-1} \max_{n=0}^{kW-1}
            input(N_i, C_j, stride[0] * h + m, stride[1] * w + n)

    ``kernel_size``, ``stride``, ``padding`` and ``dilation`` may each be a
    single ``int`` (used for both spatial dimensions) or a 2-tuple of ints
    (height, width).

    Unlike the stock module, ``forward`` also accepts a tuple/list of
    inputs, which is dispatched through ``my_data_parallel``.

    Args:
        kernel_size: the size of the window to take a max over
        stride: the stride of the window. Default value is :attr:`kernel_size`
        padding: implicit zero padding to be added on both sides
        dilation: a parameter that controls the stride of elements in the
            window
        return_indices: if True, will return the max indices along with the
            outputs. Useful when unpooling later
        ceil_mode: when True, will use `ceil` instead of `floor` to compute
            the output shape

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` where
          :math:`H_{out} = floor((H_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)`
          :math:`W_{out} = floor((W_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)`

    Examples::

        >>> m = nn.MaxPool2d((3, 2), stride=(2, 1))
        >>> output = m(autograd.Variable(torch.randn(20, 16, 50, 32)))
    """

    def __init__(self, kernel_size, stride=None, padding=0, dilation=1,
                 return_indices=False, ceil_mode=False):
        super(MaxPool2d, self).__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size
        self.padding = padding
        self.dilation = dilation
        self.return_indices = return_indices
        self.ceil_mode = ceil_mode

    def forward(self, input):
        if isinstance(input, Variable):
            return F.max_pool2d(input, self.kernel_size, self.stride,
                                self.padding, self.dilation, self.ceil_mode,
                                self.return_indices)
        elif isinstance(input, tuple) or isinstance(input, list):
            return my_data_parallel(self, input)
        else:
            raise RuntimeError('unknown input type')

    def __repr__(self):
        kh, kw = _pair(self.kernel_size)
        dh, dw = _pair(self.stride)
        padh, padw = _pair(self.padding)
        dilh, dilw = _pair(self.dilation)
        # FIX: only mention padding/dilation when they differ from their
        # defaults (padding=0, dilation=1). The previous `and`-based checks
        # hid asymmetric padding such as (1, 0), and compared dilation
        # against 0 so the default dilation=1 was always printed.
        padding_str = (', padding=(' + str(padh) + ', ' + str(padw) + ')'
                       if padh != 0 or padw != 0 else '')
        dilation_str = (', dilation=(' + str(dilh) + ', ' + str(dilw) + ')'
                        if dilh != 1 or dilw != 1 else '')
        return self.__class__.__name__ + ' (' \
            + 'size=(' + str(kh) + ', ' + str(kw) + ')' \
            + ', stride=(' + str(dh) + ', ' + str(dw) + ')' \
            + padding_str + dilation_str + ')'
class AvgPool2d(Module):
    """2D average pooling with self-parallel (tuple/list) input support.

    Averages every ``kernel_size`` window of a 4D input ``(N, C, H, W)``,
    stepping by ``stride`` (default: ``kernel_size``), with ``padding``
    points of implicit zero padding on each side.  Each of those arguments
    may be a single int or an ``(int, int)`` tuple for (height, width).

    Args:
        kernel_size: size of the pooling window
        stride: step of the window; defaults to ``kernel_size``
        padding: implicit zero padding added on both sides
        ceil_mode: use `ceil` instead of `floor` for the output shape
        count_include_pad: include the zero padding in the average

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` where
          :math:`H_{out} = floor((H_{in} + 2 * padding[0] - kernel\\_size[0]) / stride[0] + 1)`
          :math:`W_{out} = floor((W_{in} + 2 * padding[1] - kernel\\_size[1]) / stride[1] + 1)`

    Examples::

        >>> m = nn.AvgPool2d((3, 2), stride=(2, 1))
        >>> output = m(autograd.Variable(torch.randn(20, 16, 50, 32)))
    """
    def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False,
                 count_include_pad=True):
        super(AvgPool2d, self).__init__()
        self.kernel_size = kernel_size
        # A missing (falsy) stride falls back to the window size.
        self.stride = stride if stride else kernel_size
        self.padding = padding
        self.ceil_mode = ceil_mode
        self.count_include_pad = count_include_pad

    def forward(self, input):
        # Tuples/lists of Variables are dispatched across devices.
        if isinstance(input, (tuple, list)):
            return my_data_parallel(self, input)
        if isinstance(input, Variable):
            return F.avg_pool2d(input, self.kernel_size, self.stride,
                                self.padding, self.ceil_mode,
                                self.count_include_pad)
        raise RuntimeError('unknown input type')
class AdaptiveAvgPool2d(Module):
    """Adaptive 2D average pooling to a fixed output size, for any input
    size; supports self-parallel (tuple/list) inputs.

    The number of output features equals the number of input planes.

    Args:
        output_size: target output size H x W — a tuple ``(H, W)`` or a
            single number H for a square H x H output

    Examples:
        >>> m = nn.AdaptiveAvgPool2d((5, 7))
        >>> output = m(autograd.Variable(torch.randn(1, 64, 8, 9)))
        >>> m = nn.AdaptiveAvgPool2d(7)
        >>> output = m(autograd.Variable(torch.randn(1, 64, 10, 9)))
    """
    def __init__(self, output_size):
        super(AdaptiveAvgPool2d, self).__init__()
        self.output_size = output_size

    def forward(self, input):
        if isinstance(input, (tuple, list)):
            return my_data_parallel(self, input)
        if isinstance(input, Variable):
            return F.adaptive_avg_pool2d(input, self.output_size)
        raise RuntimeError('unknown input type')

    def __repr__(self):
        return '{} (output_size={})'.format(self.__class__.__name__,
                                            self.output_size)
class Dropout2d(Module):
    r"""Channel-wise dropout: zeroes whole feature maps, re-randomized each
    forward call; supports self-parallel (tuple/list) inputs.

    Wraps :class:`torch.nn.Dropout2d`.  As described in
    `Efficient Object Localization Using Convolutional Networks
    <http://arxiv.org/abs/1411.4280>`_, spatially-correlated activations
    (typical of early conv layers) make iid dropout ineffective; dropping
    entire channels promotes independence between feature maps.

    Args:
        p (float, optional): probability of zeroing a channel
        inplace (bool, optional): if True, performs the operation in-place

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)
    """
    def __init__(self, p=0.5, inplace=False):
        super(Dropout2d, self).__init__()
        if not 0 <= p <= 1:
            raise ValueError("dropout probability has to be between 0 and 1, "
                             "but got {}".format(p))
        self.p = p
        self.inplace = inplace
        self.drop = torch.nn.Dropout2d(p=p, inplace=inplace)

    def forward(self, input):
        if isinstance(input, (tuple, list)):
            # NOTE: the wrapped stock module (not self) is parallelized here.
            return my_data_parallel(self.drop, input)
        if isinstance(input, Variable):
            return self.drop(input)
        raise RuntimeError('unknown input type')

    def __repr__(self):
        extra = ', inplace' if self.inplace else ''
        return '{} (p={}{})'.format(self.__class__.__name__, self.p, extra)
class Linear(Module):
    r"""Linear transformation :math:`y = Ax + b` with self-parallel
    (tuple/list) input support.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: if False, no additive bias is learned. Default: True

    Shape:
        - Input: :math:`(N, *, in\_features)` where `*` is any number of
          extra dimensions
        - Output: :math:`(N, *, out\_features)`, all but the last dimension
          matching the input

    Attributes:
        weight: learnable weights of shape (out_features x in_features)
        bias: learnable bias of shape (out_features)
    """
    def __init__(self, in_features, out_features, bias=True):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            # Still register the (absent) parameter so state_dict stays
            # consistent.
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        # Classic torch init: uniform in +-1/sqrt(fan_in).
        bound = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-bound, bound)
        if self.bias is not None:
            self.bias.data.uniform_(-bound, bound)

    def forward(self, input):
        if isinstance(input, (tuple, list)):
            return my_data_parallel(self, input)
        if isinstance(input, Variable):
            return F.linear(input, self.weight, self.bias)
        raise RuntimeError('unknown input type')

    def __repr__(self):
        return '{} ({} -> {})'.format(self.__class__.__name__,
                                      self.in_features, self.out_features)
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import math
import torch
from torch.autograd import Variable
from torch.nn import Module
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from ..parallel import my_data_parallel
from .syncbn import BatchNorm2d
from ..functions import dilatedavgpool2d
__all__ = ['DilatedAvgPool2d', 'MyConvTranspose2d', 'View', 'Normalize',
'Bottleneck']
class DilatedAvgPool2d(Module):
    r"""Dilated 2D average pooling, provided for dilating DenseNet as in
    :class:`encoding.dilated.DenseNet`.

    Like standard average pooling, but the window is sampled with a
    dilation factor (mirroring ``Conv2d``'s dilation).  For input
    :math:`(N, C, H, W)`:

    .. math::
        \begin{array}{ll}
        out(b, c, h, w) = 1 / (kH * kW) *
        \sum_{{m}=0}^{kH-1} \sum_{{n}=0}^{kW-1}
        input(b, c, dH * h + m, dW * w + n)
        \end{array}

    ``kernel_size``, ``stride``, ``padding`` and ``dilation`` may each be a
    single int or an ``(int, int)`` tuple for (height, width).

    Args:
        kernel_size: size of the pooling window
        stride: step of the window; defaults to ``kernel_size``
        padding: implicit zero padding added on both sides
        dilation: window dilation, analogous to ``Conv2d``

    Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` where
          :math:`H_{out} = floor((H_{in} + 2 * padding[0] - kernel\_size[0]) / stride[0] + 1)`
          :math:`W_{out} = floor((W_{in} + 2 * padding[1] - kernel\_size[1]) / stride[1] + 1)`

    Examples::

        >>> m = nn.DilatedAvgPool2d(3, stride=2, dilation=2)
        >>> output = m(autograd.Variable(torch.randn(20, 16, 50, 32)))

    Reference:
        coming
    """
    def __init__(self, kernel_size, stride=None, padding=0, dilation=1):
        super(DilatedAvgPool2d, self).__init__()
        self.kernel_size = kernel_size
        # A missing (falsy) stride falls back to the window size.
        self.stride = stride if stride else kernel_size
        self.padding = padding
        self.dilation = dilation

    def forward(self, input):
        # Delegates to the package's dilated average-pooling function.
        return dilatedavgpool2d(input, self.kernel_size, self.stride,
                                self.padding, self.dilation)

    def __repr__(self):
        return '{} (size={}, stride={}, padding={}, dilation={})'.format(
            self.__class__.__name__, self.kernel_size, self.stride,
            self.padding, self.dilation)
class MyConvTranspose2d(Module):
    """Sub-pixel upsampling convolution.

    Runs a regular ``F.conv2d`` that emits ``out_channels * scale_factor**2``
    channels, then rearranges them spatially with ``F.pixel_shuffle`` so the
    output is ``scale_factor`` times larger in height and width.

    Args:
        in_channels: number of input channels
        out_channels: number of output channels (after the shuffle)
        kernel_size, stride, padding, dilation, groups: as in ``Conv2d``;
            each may be an int or an (int, int) tuple
        scale_factor: spatial upsampling factor applied via pixel shuffle
        bias: if True, adds a learnable bias

    Raises:
        ValueError: if ``in_channels`` or ``out_channels`` is not divisible
            by ``groups``
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, scale_factor=1,
                 bias=True):
        super(MyConvTranspose2d, self).__init__()
        # Fix: `_pair` is not imported in this module's import block, so
        # construction raised NameError; import it locally here.
        from torch.nn.modules.utils import _pair
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        if in_channels % groups != 0:
            raise ValueError('in_channels must be divisible by groups')
        if out_channels % groups != 0:
            raise ValueError('out_channels must be divisible by groups')
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.scale_factor = scale_factor
        # The conv produces scale_factor**2 planes per output channel,
        # which pixel_shuffle folds into the spatial dimensions.
        self.weight = Parameter(torch.Tensor(
            out_channels * scale_factor * scale_factor,
            in_channels // groups, *kernel_size))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels *
                                               scale_factor * scale_factor))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        # Classic fan-in uniform init: bound = 1/sqrt(C_in * kH * kW).
        n = self.in_channels
        for k in self.kernel_size:
            n *= k
        stdv = 1. / math.sqrt(n)
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
        if isinstance(input, Variable):
            out = F.conv2d(input, self.weight, self.bias, self.stride,
                           self.padding, self.dilation, self.groups)
            return F.pixel_shuffle(out, self.scale_factor)
        elif isinstance(input, (tuple, list)):
            # self-parallel mode: dispatch across devices
            return my_data_parallel(self, input)
        else:
            raise RuntimeError('unknown input type')
class View(Module):
    """Reshape the input to a fixed size; supports SelfParallel mode.

    The target size is given either as a single ``torch.Size`` or as the
    individual dimensions as integer arguments.
    """
    def __init__(self, *args):
        super(View, self).__init__()
        if len(args) == 1 and isinstance(args[0], torch.Size):
            self.size = args[0]
        else:
            self.size = torch.Size(args)

    def forward(self, input):
        if isinstance(input, (tuple, list)):
            # apply the same view to every element
            return view_each(input, self.size)
        if isinstance(input, Variable):
            return input.view(self.size)
        raise RuntimeError('unknown input type')
class Normalize(Module):
    r"""Perform :math:`L_p` normalization over a specified dimension:

    .. math::
        v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}

    Each subtensor along ``dim`` is flattened and divided by its p-norm
    (:math:`\lVert v \rVert_p` is not a matrix norm).  Defaults normalize
    the second dimension with the Euclidean norm.

    Args:
        p (float): exponent of the norm. Default: 2
        dim (int): dimension to reduce. Default: 1
    """
    def __init__(self, p=2, dim=1):
        super(Normalize, self).__init__()
        self.p = p
        self.dim = dim

    def forward(self, x):
        if isinstance(x, (tuple, list)):
            return my_data_parallel(self, x)
        if isinstance(x, Variable):
            return F.normalize(x, self.p, self.dim)
        raise RuntimeError('unknown input type')
class Bottleneck(Module):
    """Pre-activation residual bottleneck block.

    Identity Mapping in Deep Residual Networks,
    ref https://arxiv.org/abs/1603.05027
    """
    def __init__(self, inplanes, planes, stride=1,
                 norm_layer=BatchNorm2d):
        super(Bottleneck, self).__init__()
        self.expansion = 4
        outplanes = planes * self.expansion
        # A 1x1 projection is required whenever the channel count or the
        # spatial stride changes.
        self.downsample = (inplanes != outplanes or stride != 1)
        if self.downsample:
            self.residual_layer = Conv2d(inplanes, outplanes,
                                         kernel_size=1, stride=stride)
        # Pre-activation ordering (BN -> ReLU -> Conv), three stages:
        # 1x1 reduce, 3x3 (carries the stride), 1x1 expand.
        self.conv_block = Sequential(
            norm_layer(inplanes),
            ReLU(inplace=True),
            Conv2d(inplanes, planes, kernel_size=1, stride=1),
            norm_layer(planes),
            ReLU(inplace=True),
            Conv2d(planes, planes, kernel_size=3, stride=stride,
                   padding=1),
            norm_layer(planes),
            ReLU(inplace=True),
            Conv2d(planes, outplanes, kernel_size=1, stride=1))

    def forward(self, x):
        residual = self.residual_layer(x) if self.downsample else x
        if isinstance(x, Variable):
            return residual + self.conv_block(x)
        if isinstance(x, (tuple, list)):
            # self-parallel mode: element-wise sum across device shards
            return sum_each(residual, self.conv_block(x))
        raise RuntimeError('unknown input type')
def _get_a_var(obj):
if isinstance(obj, Variable):
return obj
if isinstance(obj, list) or isinstance(obj, tuple):
results = map(_get_a_var, obj)
for result in results:
if isinstance(result, Variable):
return result
if isinstance(obj, dict):
results = map(_get_a_var, obj.items())
for result in results:
if isinstance(result, Variable):
return result
return None
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function, Variable
from .._ext import encoding_lib
from ..functions import scaledL2, aggregate, aggregateP, residual, assign
from ..parallel import my_data_parallel
__all__ = ['Encoding', 'Inspiration', 'GramMatrix', 'Aggregate','EncodingP']
class Encoding(nn.Module):
    r"""
    Encoding Layer: a learnable residual encoder over 3d or 4d input that
    is seen as a mini-batch.

    .. math::
        e_{ik} = \frac{exp(-s_k\|x_{i}-c_k\|^2)}{\sum_{j=1}^K exp(-s_j\|x_{i}-c_j\|^2)} (x_i - c_k)

    Please see the `example of training Deep TEN <./experiments/texture.html>`_.

    Args:
        D: dimension of the features or feature channels
        K: number of codewords

    Shape:
        - Input: :math:`X\in\mathcal{R}^{B\times N\times D}` or :math:`\mathcal{R}^{B\times D\times H\times W}` (where :math:`B` is batch, :math:`N` is total number of features or :math:`H\times W`.)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`

    Attributes:
        codewords (Tensor): the learnable codewords of shape (:math:`K\times D`)
        scale (Tensor): the learnable scale factor of visual centers

    Reference:
        Hang Zhang, Jia Xue, and Kristin Dana. "Deep TEN: Texture Encoding Network." *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2017*
    """
    def __init__(self, D, K):
        super(Encoding, self).__init__()
        # init codewords and smoothing factor
        self.D, self.K = D, K
        self.codewords = nn.Parameter(torch.Tensor(K, D),
                                      requires_grad=True)
        self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
        self.reset_params()

    def reset_params(self):
        # Uniform init scaled by 1/sqrt(K*D) (codewords) and 1/sqrt(K) (scale).
        std1 = 1. / ((self.K * self.D) ** (1 / 2))
        std2 = 1. / (self.K ** (1 / 2))
        self.codewords.data.uniform_(-std1, std1)
        self.scale.data.uniform_(-std2, std2)

    def forward(self, X):
        if isinstance(X, (tuple, list)):
            # for self-parallel mode, please see encoding.nn
            return my_data_parallel(self, X)
        elif not isinstance(X, Variable):
            raise RuntimeError('unknown input type')
        # Fix: the original `assert(cond, msg)` asserted a non-empty tuple,
        # which is always true; assert the condition itself.
        assert X.size(1) == self.D, "Encoding Layer wrong channels!"
        if X.dim() == 3:
            # BxDxN -> BxNxD
            B, N, K, D = X.size(0), X.size(2), self.K, self.D
            X = X.transpose(1, 2).contiguous()
        elif X.dim() == 4:
            # BxDxHxW -> Bx(HW)xD
            B, N, K, D = X.size(0), X.size(2) * X.size(3), self.K, self.D
            X = X.view(B, D, -1).transpose(1, 2).contiguous()
        else:
            raise RuntimeError('Encoding Layer unknown input dims!')
        # assignment weights, normalized over the K codewords (dim=2); the
        # original relied on F.softmax's deprecated implicit dim, which for
        # 3D input normalized over N instead of K.
        A = F.softmax(scaledL2(X, self.codewords, self.scale), dim=2)
        # aggregate the residuals with the assignment weights
        E = aggregate(A, X, self.codewords)
        return E

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
            + str(self.D) + ')'
class Inspiration(nn.Module):
    r"""Inspiration Layer (for MSG-Net): tune a feature map with a target
    Gram matrix.

    .. math::
        Y = \phi^{-1}[\phi(\mathcal{F}^T)W\mathcal{G}]

    Please see the `example of MSG-Net <./experiments/style.html>`_
    training multi-style generative network for real-time transfer.

    Reference:
        Hang Zhang, and Kristin Dana. "Multi-style Generative Network for Real-time Transfer." *arXiv preprint arXiv:1703.06953 (2017)*
    """
    def __init__(self, C, B=1):
        super(Inspiration, self).__init__()
        # learnable CxC mixing matrix (B is 1 or the input mini-batch size)
        self.weight = nn.Parameter(torch.Tensor(1, C, C), requires_grad=True)
        # target Gram matrix: a non-parameter buffer, set via setTarget
        self.G = Variable(torch.Tensor(B, C, C), requires_grad=True)
        self.C = C
        self.reset_parameters()

    def reset_parameters(self):
        self.weight.data.uniform_(0.0, 0.02)

    def setTarget(self, target):
        self.G = target

    def forward(self, X):
        # X is a 4D feature map: mix the target Gram with the learned
        # weight, then apply the result to the flattened features.
        self.P = torch.bmm(self.weight.expand_as(self.G), self.G)
        mixed = self.P.transpose(1, 2).expand(X.size(0), self.C, self.C)
        flat = X.view(X.size(0), X.size(1), -1)
        return torch.bmm(mixed, flat).view_as(X)

    def __repr__(self):
        return '{}(N x {})'.format(self.__class__.__name__, self.C)
class GramMatrix(nn.Module):
    r"""Gram matrix of a 4D convolutional feature map, per mini-batch
    sample, normalized by ``ch * h * w``.

    .. math::
        \mathcal{G} = \sum_{h=1}^{H_i}\sum_{w=1}^{W_i} \mathcal{F}_{h,w}\mathcal{F}_{h,w}^T
    """
    def forward(self, y):
        b, ch, h, w = y.size()
        flat = y.view(b, ch, w * h)
        # (b, ch, hw) x (b, hw, ch) -> (b, ch, ch)
        return flat.bmm(flat.transpose(1, 2)) / (ch * h * w)
class Aggregate(nn.Module):
    r"""Aggregate residuals (:math:`R`) with assignment weights
    (:math:`A`):

    .. math::
        e_{k} = \sum_{i=1}^{N} a_{ik} r_{ik}

    Shape:
        - Input: :math:`A\in\mathcal{R}^{B\times N\times K}` and :math:`R\in\mathcal{R}^{B\times N\times K\times D}` (:math:`B` batch, :math:`N` number of features, :math:`K` number of codewords, :math:`D` feature dimension)
        - Output: :math:`E\in\mathcal{R}^{B\times K\times D}`
    """
    def forward(self, A, R):
        if isinstance(A, (tuple, list)):
            # for self-parallel mode, please see encoding.nn
            return my_data_parallel(self, A, R)
        if isinstance(A, Variable):
            return aggregateP(A, R)
        raise RuntimeError('unknown input type')
class EncodingP(nn.Module):
    """Deprecated variant of :class:`Encoding` that computes residuals and
    assignment weights with separate kernels; use :class:`Encoding` instead.

    Args:
        D: dimension of the features or feature channels
        K: number of codewords
    """
    def __init__(self, D, K):
        super(EncodingP, self).__init__()
        # init codewords and smoothing factor
        self.D, self.K = D, K
        self.codewords = nn.Parameter(torch.Tensor(K, D),
                                      requires_grad=True)
        self.scale = nn.Parameter(torch.Tensor(K), requires_grad=True)
        self.reset_params()
        print('EncodingP is deprecated, please use Encoding.')

    def reset_params(self):
        # Uniform init scaled by 1/sqrt(K*D) (codewords) and 1/sqrt(K) (scale).
        std1 = 1. / ((self.K * self.D) ** (1 / 2))
        std2 = 1. / (self.K ** (1 / 2))
        self.codewords.data.uniform_(-std1, std1)
        self.scale.data.uniform_(-std2, std2)

    def forward(self, X):
        if isinstance(X, (tuple, list)):
            # for self-parallel mode, please see encoding.nn
            return my_data_parallel(self, X)
        elif not isinstance(X, Variable):
            raise RuntimeError('unknown input type')
        # Fix: the original `assert(cond, msg)` asserted a non-empty tuple,
        # which is always true; assert the condition itself.
        assert X.size(1) == self.D, "Encoding Layer wrong channels!"
        if X.dim() == 3:
            # BxDxN -> BxNxD
            B, N, K, D = X.size(0), X.size(2), self.K, self.D
            X = X.transpose(1, 2)
        elif X.dim() == 4:
            # BxDxHxW -> Bx(HW)xD
            B, N, K, D = X.size(0), X.size(2) * X.size(3), self.K, self.D
            X = X.view(B, D, -1).transpose(1, 2)
        else:
            raise RuntimeError('Encoding Layer unknown input dims!')
        # calculate residuals
        R = residual(X.contiguous(), self.codewords)
        # assignment weights
        A = assign(R, self.scale)
        # aggregate
        E = aggregateP(A, R)
        return E

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'N x ' + str(self.D) + '=>' + str(self.K) + 'x' \
            + str(self.D) + ')'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment