Commit 2d21747a authored by Zhang

v0.4.2

parent 7e19143c
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
import torch.utils.data as data
from PIL import Image, ImageOps
import torchvision.transforms as transform
from .dataset import ToLabel
class FolderLoader(data.Dataset):
def __init__(self, root, transform=None):
self.root = root
self.transform = transform
self.images = get_folder_images(root)
if len(self.images) == 0:
raise(RuntimeError("Found 0 images in subfolders of: \
" + self.root + "\n"))
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
def __len__(self):
return len(self.images)
def get_folder_images(img_folder):
img_paths = []
for filename in os.listdir(img_folder):
if filename.endswith(".jpg"):
imgpath = os.path.join(img_folder, filename)
img_paths.append(imgpath)
return img_paths
class Dataloder():
def __init__(self, args):
# the data augmentation is implemented as part of the dataloader
assert(args.test)
input_transform = transform.Compose([
transform.ToTensor(),
transform.Normalize(args.mean, args.std)])
args.test_batch_size = 1
assert(args.test_folder is not None)
print('loading the data from: {}'.format(args.test_folder))
testset = FolderLoader(args.test_folder, input_transform)
kwargs = {'num_workers': args.workers, 'pin_memory': True} \
if args.cuda else {}
self.trainloader = None
self.testloader = data.DataLoader(testset,
batch_size=args.test_batch_size,
shuffle=False, **kwargs)
def getloader(self):
return self.trainloader, self.testloader
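A minimal usage sketch for the folder loader above (not part of this commit; the `args` values below, including the folder path and the ImageNet-style mean/std, are illustrative assumptions):
import argparse
# Hypothetical arguments; only the attributes Dataloder actually reads are set.
args = argparse.Namespace(test=True, test_folder='./demo_images', workers=2, cuda=False,
                          mean=[.485, .456, .406], std=[.229, .224, .225])
loader = Dataloder(args)
_, test_loader = loader.getloader()
for img, name in test_loader:
    print(name[0], tuple(img.shape))  # one (1, 3, H, W) tensor and its file name per batch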
# created by: Sean Liu
# Amazon Lab 126
from __future__ import print_function
import errno
import hashlib
import os
import sys
import tarfile
import numpy as np
import random
import math
import torch.utils.data as data
import PIL
from PIL import Image, ImageOps
from six.moves import urllib
class Segmentation_HPW18(data.Dataset):
CLASSES = [
'background', 'hat', 'hair', 'sunglasses', 'upper-clothes',
'skirt', 'pants', 'dress', 'belt', 'left-shoe', 'right-shoe',
'face', 'left-leg', 'right-leg', 'left-arm', 'right-arm', 'bag',
'scarf'
]
URL = "/cvdata1/lliuqian/humanParsingDataset"
FILE = "hpw18.tar.gz"
MD5 = ''
BASE_DIR = ''
def __init__(self,
root,
train=True,
transform=None,
target_transform=None,
download=False):
self.root = root
_hpw18_root = os.path.join(self.root, self.BASE_DIR)
_mask_dir = os.path.join(_hpw18_root, 'SegmentationClassAug_256x384')
_image_dir = os.path.join(_hpw18_root, 'JPEGImages_256x384')
self.transform = transform
self.target_transform = target_transform
self.train = train
if download:
self._download()
# train/val/test splits are pre-cut
_splits_dir = _hpw18_root
_split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_train.txt')
if not self.train:
_split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_val.txt')
print("reading from ", _split_f)
self.images = []
self.masks = []
with open(os.path.join(_split_f), "r") as lines:
for line in lines:
s = line.split()
_image = s[0] # absolute path of the image
_mask = s[1] # absolute path of the mask
assert os.path.isfile(_image)
assert os.path.isfile(_mask)
self.images.append(_image)
self.masks.append(_mask)
assert (len(self.images) == len(self.masks))
def __getitem__(self, index):
_img = Image.open(self.images[index]).convert('RGB')
_timg = Image.open(self.masks[index])
_target = np.array(_timg, dtype=np.uint8)
_target = Image.fromarray(_target)
# synchronized transform
if self.train:
_img, _target = self._sync_transform( _img, _target)
# general resize, normalize and toTensor
if self.transform is not None:
_img = self.transform(_img)
if self.target_transform is not None:
_target = self.target_transform(_target)
return _img, _target
def __len__(self):
return len(self.images)
def _sync_transform(self, img, mask):
# random rotate -10~10
deg = random.uniform(-10,10)
img = img.rotate(deg)
mask = mask.rotate(deg, PIL.Image.NEAREST)
return img, mask
if __name__ == '__main__':
hpw18 = Segmentation_HPW18('/cvdata1/lliuqian/', train=True)
print(hpw18[0])
print (len(hpw18))
import os
import random
import scipy.io
import numpy as np
from PIL import Image, ImageOps, ImageFilter
from .base import BaseDataset
class VOCAugSegmentation(BaseDataset):
voc = [
'background', 'airplane', 'bicycle', 'bird', 'boat', 'bottle',
'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
'motorcycle', 'person', 'potted-plant', 'sheep', 'sofa', 'train',
'tv'
]
NUM_CLASS = 21
TRAIN_BASE_DIR = 'VOCaug/dataset/'
def __init__(self, root, split='train', mode=None, transform=None,
target_transform=None):
super(VOCAugSegmentation, self).__init__(root, split, mode, transform, target_transform)
# train/val/test splits are pre-cut
_voc_root = os.path.join(root, self.TRAIN_BASE_DIR)
_mask_dir = os.path.join(_voc_root, 'cls')
_image_dir = os.path.join(_voc_root, 'img')
if self.mode == 'train':
_split_f = os.path.join(_voc_root, 'trainval.txt')
elif self.mode == 'val':
_split_f = os.path.join(_voc_root, 'val.txt')
else:
raise RuntimeError('Unknown dataset split.')
self.images = []
self.masks = []
with open(os.path.join(_split_f), "r") as lines:
for line in lines:
_image = os.path.join(_image_dir, line.rstrip('\n')+".jpg")
assert os.path.isfile(_image)
self.images.append(_image)
if self.mode != 'test':
_mask = os.path.join(_mask_dir, line.rstrip('\n')+".mat")
assert os.path.isfile(_mask)
self.masks.append(_mask)
assert (len(self.images) == len(self.masks))
def __getitem__(self, index):
_img = Image.open(self.images[index]).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
_img = self.transform(_img)
return _img, os.path.basename(self.images[index])
_target = self._load_mat(self.masks[index])
# synchronized transform
if self.mode == 'train':
_img, _target = self._sync_transform( _img, _target)
elif self.mode == 'val':
_img, _target = self._val_sync_transform( _img, _target)
# general resize, normalize and toTensor
if self.transform is not None:
_img = self.transform(_img)
if self.target_transform is not None:
_target = self.target_transform(_target)
return _img, _target
def _load_mat(self, filename):
mat = scipy.io.loadmat(filename, mat_dtype=True, squeeze_me=True,
struct_as_record=False)
mask = mat['GTcls'].Segmentation
return Image.fromarray(mask)
def __len__(self):
return len(self.images)
import os
import random
import numpy as np
from PIL import Image, ImageOps, ImageFilter
from tqdm import tqdm
import torch
from .base import BaseDataset
class VOCSegmentation(BaseDataset):
CLASSES = [
'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'potted-plant', 'sheep', 'sofa', 'train',
'tv/monitor', 'ambigious'
]
NUM_CLASS = 21
BASE_DIR = 'VOCdevkit/VOC2012'
def __init__(self, root, split='train', mode=None, transform=None,
target_transform=None):
super(VOCSegmentation, self).__init__(root, split, mode, transform, target_transform)
_voc_root = os.path.join(self.root, self.BASE_DIR)
_mask_dir = os.path.join(_voc_root, 'SegmentationClass')
_image_dir = os.path.join(_voc_root, 'JPEGImages')
# train/val/test splits are pre-cut
_splits_dir = os.path.join(_voc_root, 'ImageSets/Segmentation')
if self.mode == 'train':
_split_f = os.path.join(_splits_dir, 'trainval.txt')
elif self.mode == 'val':
_split_f = os.path.join(_splits_dir, 'val.txt')
elif self.mode == 'test':
_split_f = os.path.join(_splits_dir, 'test.txt')
else:
raise RuntimeError('Unknown dataset split.')
self.images = []
self.masks = []
with open(os.path.join(_split_f), "r") as lines:
for line in tqdm(lines):
_image = os.path.join(_image_dir, line.rstrip('\n')+".jpg")
assert os.path.isfile(_image)
self.images.append(_image)
if self.mode != 'test':
_mask = os.path.join(_mask_dir, line.rstrip('\n')+".png")
assert os.path.isfile(_mask)
self.masks.append(_mask)
if self.mode != 'test':
assert (len(self.images) == len(self.masks))
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
target = Image.open(self.masks[index])
# synchronized transform
if self.mode == 'train':
img, target = self._sync_transform( img, target)
elif self.mode == 'val':
img, target = self._val_sync_transform( img, target)
else:
assert self.mode == 'testval'
target = self._mask_transform(target)
# general resize, normalize and toTensor
if self.transform is not None:
#print("transform for input")
img = self.transform(img)
if self.target_transform is not None:
#print("transform for label")
target = self.target_transform(target)
return img, target
def _mask_transform(self, mask):
target = np.array(mask).astype('int32')
target[target == 255] = -1
return torch.from_numpy(target).long()
def __len__(self):
return len(self.images)
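As a quick illustration of `_mask_transform` above (a sketch, not part of the commit): the VOC ignore value 255 is remapped to -1 so it can be excluded later on.
import numpy as np
import torch
mask = np.array([[0, 15, 255]], dtype=np.uint8)   # hypothetical 1x3 label patch
target = mask.astype('int32')
target[target == 255] = -1                         # same remapping as _mask_transform
print(torch.from_numpy(target).long())             # tensor([[ 0, 15, -1]])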
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
from PIL import Image, ImageOps, ImageFilter
import os
import os.path
import math
import random
import numpy as np
import torch
from .base import BaseDataset
class ContextSegmentation(BaseDataset):
BASE_DIR = 'VOCdevkit/VOC2010'
NUM_CLASS = 59
def __init__(self, root=os.path.expanduser('~/.encoding/data'), split='train',
mode=None, transform=None, target_transform=None):
super(ContextSegmentation, self).__init__(
root, split, mode, transform, target_transform)
from detail import Detail
#from detail import mask
root = os.path.join(root, self.BASE_DIR)
annFile = os.path.join(root, 'trainval_merged.json')
imgDir = os.path.join(root, 'JPEGImages')
# training mode
if split == 'train':
phase = 'train'
elif split == 'val':
phase = 'val'
elif split == 'test':
phase = 'val'
#phase = 'test'
print('annFile', annFile)
print('imgDir', imgDir)
self.detail = Detail(annFile, imgDir, phase)
self.transform = transform
self.target_transform = target_transform
self.ids = self.detail.getImgs()
self._mapping = np.sort(np.array([
0, 2, 259, 260, 415, 324, 9, 258, 144, 18, 19, 22,
23, 397, 25, 284, 158, 159, 416, 33, 162, 420, 454, 295, 296,
427, 44, 45, 46, 308, 59, 440, 445, 31, 232, 65, 354, 424,
68, 326, 72, 458, 34, 207, 80, 355, 85, 347, 220, 349, 360,
98, 187, 104, 105, 366, 189, 368, 113, 115]))
self._key = np.array(range(len(self._mapping))).astype('uint8')
def _class_to_index(self, mask):
# assert the values
values = np.unique(mask)
#assert(values.size > 1)
for i in range(len(values)):
assert(values[i] in self._mapping)
index = np.digitize(mask.ravel(), self._mapping, right=True)
return self._key[index].reshape(mask.shape)
def __getitem__(self, index):
detail = self.detail
img_id = self.ids[index]
path = img_id['file_name']
iid = img_id['image_id']
img = Image.open(os.path.join(detail.img_folder, path)).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(path)
# convert mask to 60 categories
mask = Image.fromarray(self._class_to_index(
detail.getMask(img_id)))
# synchronized transform
if self.mode == 'train':
img, mask = self._sync_transform(img, mask)
elif self.mode == 'val':
img, mask = self._val_sync_transform(img, mask)
else:
assert self.mode == 'testval'
mask = self._mask_transform(mask)
# general resize, normalize and toTensor
if self.transform is not None:
#print("transform for input")
img = self.transform(img)
if self.target_transform is not None:
#print("transform for label")
mask = self.target_transform(mask)
return img, mask
def _mask_transform(self, mask):
target = np.array(mask).astype('int32') - 1
return torch.from_numpy(target).long()
def __len__(self):
return len(self.ids)
@property
def pred_offset(self):
return 1
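A small sketch (not part of the commit) of the `_class_to_index` remapping above: raw category ids from the Detail annotations are located in the sorted `_mapping` array and replaced by contiguous indices starting at 0; the shortened mapping below is a stand-in for illustration.
import numpy as np
mapping = np.sort(np.array([0, 2, 9, 18, 19, 22]))   # stand-in for self._mapping
key = np.arange(len(mapping)).astype('uint8')        # contiguous class indices
raw = np.array([[0, 9, 22], [2, 18, 19]])            # hypothetical raw mask values
index = np.digitize(raw.ravel(), mapping, right=True)
print(key[index].reshape(raw.shape))                 # [[0 2 5], [1 3 4]]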
"""Dilated ResNet and DenseNet""" """Dilated ResNet and DenseNet"""
from .resnet import * from .resnet import *
from .densenet import *
"""Dilated DenseNet"""
from collections import OrderedDict
import torch
import torch.utils.model_zoo as model_zoo
from .. import nn
from .. import functions as F
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
model_urls = {
'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
def densenet121(pretrained=False, **kwargs):
r"""Densenet-121 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet121']))
return model
def densenet169(pretrained=False, **kwargs):
r"""Densenet-169 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet169']))
return model
def densenet201(pretrained=False, **kwargs):
r"""Densenet-201 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet201']))
return model
def densenet161(pretrained=False, **kwargs):
r"""Densenet-161 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
**kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['densenet161']))
return model
class _DenseLayer(nn.Sequential):
# pylint: disable=expression-not-assigned
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, dilation=1):
super(_DenseLayer, self).__init__()
self.add_module('norm.1', nn.BatchNorm2d(num_input_features)),
self.add_module('relu.1', nn.ReLU(inplace=True)),
self.add_module('conv.1', nn.Conv2d(
num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)),
self.add_module('relu.2', nn.ReLU(inplace=True)),
self.add_module('conv.2', nn.Conv2d(
bn_size * growth_rate, growth_rate, kernel_size=3, stride=1,
padding=dilation, dilation=dilation, bias=False)),
self.drop_rate = drop_rate
def forward(self, x):
new_features = super(_DenseLayer, self).forward(x)
if self.drop_rate > 0:
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
return torch.cat([x, new_features], 1)
class _DenseBlock(nn.Sequential):
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, dilation=1):
super(_DenseBlock, self).__init__()
for i in range(num_layers):
layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate, dilation=dilation)
self.add_module('denselayer%d' % (i + 1), layer)
class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features, stride, dilation=1):
super(_Transition, self).__init__()
self.add_module('norm', nn.BatchNorm2d(num_input_features))
self.add_module('relu', nn.ReLU(inplace=True))
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
kernel_size=1, stride=1, bias=False))
self.add_module('pool', nn.DilatedAvgPool2d(kernel_size=2, stride=stride,
dilation=dilation))
class DenseNet(nn.Module):
r"""Dilated DenseNet.
For correct dilation of the DenseNet transition layers, we implement :class:`encoding.nn.DilatedAvgPool2d`.
Args:
growth_rate (int) - how many filters to add each layer (`k` in paper)
block_config (list of 4 ints) - how many layers in each pooling block
num_init_features (int) - the number of filters to learn in the first convolution layer
bn_size (int) - multiplicative factor for number of bottleneck layers
(i.e. bn_size * k features in the bottleneck layer)
drop_rate (float) - dropout rate after each dense layer
num_classes (int) - number of classification classes
Reference:
Huang, Gao, et al. "Densely Connected Convolutional Networks" *CVPR 2017*
"""
def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
super(DenseNet, self).__init__()
# First convolution
self.features = nn.Sequential(OrderedDict([
('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
('norm0', nn.BatchNorm2d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
]))
# Each denseblock
strides = [1, 2, 1, 1]
dilations = [1, 1, 2, 4]
num_features = num_init_features
for i, num_layers in enumerate(block_config):
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate,
dilation=dilations[i])
self.features.add_module('denseblock%d' % (i + 1), block)
num_features = num_features + num_layers * growth_rate
if i != len(block_config) - 1:
trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2, stride=strides[i+1], dilation=dilations[i])
self.features.add_module('transition%d' % (i + 1), trans)
num_features = num_features // 2
# Final batch norm
self.features.add_module('norm5', nn.BatchNorm2d(num_features))
# Linear layer
self.classifier = nn.Linear(num_features, num_classes)
def forward(self, x):
features = self.features(x)
out = F.relu(features, inplace=True)
# out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1)
# out = self.classifier(out)
return out
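For reference, the channel bookkeeping of the constructor loop above for the densenet121 configuration (a sketch, not part of the commit); with strides = [1, 2, 1, 1] only transition1 halves the spatial size, so after the stride-4 stem the final feature map has overall stride 8.
num_features = 64                                  # num_init_features for densenet121
for i, num_layers in enumerate((6, 12, 24, 16)):   # block_config
    num_features += num_layers * 32                # growth_rate = 32
    if i != 3:
        num_features //= 2                         # transition layers halve the channels
print(num_features)                                # 1024 channels in the stride-8 output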
@@ -26,15 +26,15 @@ class BasicBlock(nn.Module):
     """ResNet BasicBlock
     """
     expansion = 1
-    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1,
+    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, previous_dilation=1,
                  norm_layer=None):
         super(BasicBlock, self).__init__()
         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
                                padding=dilation, dilation=dilation, bias=False)
         self.bn1 = norm_layer(planes)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
-                               padding=first_dilation, dilation=first_dilation, bias=False)
+                               padding=previous_dilation, dilation=previous_dilation, bias=False)
         self.bn2 = norm_layer(planes)
         self.downsample = downsample
         self.stride = stride
@@ -64,7 +64,7 @@ class Bottleneck(nn.Module):
     # pylint: disable=unused-argument
     expansion = 4
     def __init__(self, inplanes, planes, stride=1, dilation=1,
-                 downsample=None, first_dilation=1, norm_layer=None):
+                 downsample=None, previous_dilation=1, norm_layer=None):
         super(Bottleneck, self).__init__()
         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
         self.bn1 = norm_layer(planes)
@@ -75,7 +75,7 @@ class Bottleneck(nn.Module):
         self.conv3 = nn.Conv2d(
             planes, planes * 4, kernel_size=1, bias=False)
         self.bn3 = norm_layer(planes * 4)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.downsample = downsample
         self.dilation = dilation
         self.stride = stride
@@ -113,6 +113,21 @@ class Bottleneck(nn.Module):
 class ResNet(nn.Module):
     """Dilated Pre-trained ResNet Model, which produces the stride-8 feature maps at conv5.
+    Parameters
+    ----------
+    block : Block
+        Class for the residual block. Options are BasicBlockV1, BottleneckV1.
+    layers : list of int
+        Numbers of layers in each block
+    classes : int, default 1000
+        Number of classification classes.
+    dilated : bool, default False
+        Applying dilation strategy to pretrained ResNet yielding a stride-8 model,
+        typically used in Semantic Segmentation.
+    norm_layer : object
+        Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`;
+        for Synchronized Cross-GPU BatchNormalization).
     Reference:
         - He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
@@ -120,18 +135,26 @@ class ResNet(nn.Module):
         - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
     """
     # pylint: disable=unused-variable
-    def __init__(self, block, layers, num_classes=1000, norm_layer=None):
+    def __init__(self, block, layers, num_classes=1000, dilated=True, norm_layer=nn.BatchNorm2d):
         self.inplanes = 64
         super(ResNet, self).__init__()
         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                                bias=False)
         self.bn1 = norm_layer(64)
-        self.relu = nn.ReLU(inplace=False)
+        self.relu = nn.ReLU(inplace=True)
         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
         self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
         self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
-        self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
-        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
+        if dilated:
+            self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
+                                           dilation=2, norm_layer=norm_layer)
+            self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
+                                           dilation=4, norm_layer=norm_layer)
+        else:
+            self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
+                                           norm_layer=norm_layer)
+            self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
+                                           norm_layer=norm_layer)
         self.avgpool = nn.AvgPool2d(7)
         self.fc = nn.Linear(512 * block.expansion, num_classes)
@@ -155,16 +178,16 @@ class ResNet(nn.Module):
         layers = []
         if dilation == 1 or dilation == 2:
             layers.append(block(self.inplanes, planes, stride, dilation=1,
-                                downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
+                                downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         elif dilation == 4:
             layers.append(block(self.inplanes, planes, stride, dilation=2,
-                                downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
+                                downsample=downsample, previous_dilation=dilation, norm_layer=norm_layer))
         else:
             raise RuntimeError("=> unknown dilation size: {}".format(dilation))
         self.inplanes = planes * block.expansion
         for i in range(1, blocks):
-            layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation,
+            layers.append(block(self.inplanes, planes, dilation=dilation, previous_dilation=dilation,
                                 norm_layer=norm_layer))
         return nn.Sequential(*layers)
...
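The practical effect of the new `dilated` flag (an illustrative sketch, not part of the diff): with `dilated=True`, layer3/layer4 keep stride 1 and use dilation 2/4, so the backbone output stride drops from 32 to 8.
size = 224
for stride in (2, 2, 1, 2, 1, 1):   # conv1, maxpool, layer1, layer2, layer3, layer4 (dilated=True)
    size //= stride
print(size)  # 28, i.e. stride 8; with dilated=False the last two strides are 2, giving 7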
"""Encoding Autograd Fuctions""" """Encoding Autograd Fuctions"""
from .encoding import * from .encoding import *
from .syncbn import * from .syncbn import *
from .customize import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Customized Functions"""
import math
import torch
from torch.autograd import Function, Variable
from torch.nn.modules.utils import _pair
from .._ext import encoding_lib
__all__ = ['dilatedavgpool2d']
class _dilatedavgpool2d(Function):
@staticmethod
def forward(ctx, input, kernel_size, stride, padding,
dilation=1):
ctx.kH, ctx.kW = _pair(kernel_size)
ctx.dH, ctx.dW = _pair(stride if stride is not None else kernel_size)
ctx.padH, ctx.padW = _pair(padding)
ctx.dilationH, ctx.dilationW = _pair(dilation)
b, c, h, w = input.size()
if ctx.dH == 1 and ctx.dW == 1:
# keep the size for dilated avgpool
ow, oh = w, h
else:
ow = math.floor(float(w-ctx.kW+2*ctx.padW)/float(ctx.dW)) +1
oh = math.floor(float(h-ctx.kH+2*ctx.padH)/float(ctx.dH)) +1
with torch.cuda.device_of(input):
output = input.new(b, c, oh, ow)
ctx.save_for_backward(input)
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Forward(
input, output, ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH,
ctx.padW, ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return output
@staticmethod
def backward(ctx, gradOutput):
input, = ctx.saved_variables
with torch.cuda.device_of(input):
gradInput = Variable(input.data.new().resize_as_(input.data))
if isinstance(input.data, torch.cuda.FloatTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Float_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
elif isinstance(input.data, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Double_DilatedAvgPool2d_Backward(
gradInput.data, gradOutput.data,
ctx.kH, ctx.kW, ctx.dH, ctx.dW, ctx.padH, ctx.padW,
ctx.dilationH, ctx.dilationW)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput, None, None, None, None
def dilatedavgpool2d(input, kernel_size, stride=None, padding=0,
dilation=1):
"""Dilated Average Pool 2d, for dilation of DenseNet.
Reference:
Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang,
Ambrish Tyagi, Amit Agrawal. "Context Encoding for Semantic Segmentation." CVPR 2018
Applies 2D average-pooling operation in kh x kw regions by step size
dh x dw steps. The number of output features is equal to the number of
input planes.
See :class:`~encoding.nn.DilatedAvgPool2d` for details and output shape.
Args:
input: input tensor (minibatch x in_channels x iH x iW)
kernel_size: size of the pooling region, a single number or a
tuple (kh x kw)
stride: stride of the pooling operation, a single number or a
tuple (sh x sw). Default is equal to kernel size
padding: implicit zero padding on the input, a single number or
a tuple (padh x padw), Default: 0
dilation: the dilation parameter similar to Conv2d
"""
return _dilatedavgpool2d.apply(input, kernel_size, stride, padding, dilation)
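A small shape sketch (not part of the commit) of the pooling geometry implemented by `_dilatedavgpool2d.forward`: stride defaults to the kernel size, and a stride of 1 keeps the input resolution, which is what the dilated DenseNet transitions rely on.
import math
def dilated_avgpool_out_size(h, w, kernel_size=2, stride=None, padding=0):
    s = kernel_size if stride is None else stride   # same default as the Function above
    if s == 1:
        return h, w                                 # stride-1 dilated pooling keeps the size
    return (math.floor((h - kernel_size + 2 * padding) / s) + 1,
            math.floor((w - kernel_size + 2 * padding) / s) + 1)
print(dilated_avgpool_out_size(56, 56, stride=1))   # (56, 56)
print(dilated_avgpool_out_size(56, 56, stride=2))   # (28, 28)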
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
+## Email: zhanghang0704@gmail.com
+## Copyright (c) 2018
 ##
 ## This source code is licensed under the MIT-style license found in the
 ## LICENSE file in the root directory of this source tree
@@ -11,7 +10,7 @@
 """Functions for Encoding Layer"""
 import torch
 from torch.autograd import Function, Variable
-from .._ext import encoding_lib
+from .. import lib
 __all__ = ['aggregate', 'scaledL2']
@@ -20,47 +19,27 @@ class _aggregate(Function):
     def forward(ctx, A, X, C):
         # A \in(BxNxK) R \in(BxNxKxD) => E \in(BxNxD)
         ctx.save_for_backward(A, X, C)
-        B, _, K = A.size()
-        D = X.size(2)
-        with torch.cuda.device_of(A):
-            E = A.new(B, K, D)
-        if isinstance(A, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Float_aggregate_forward(E, A, X, C)
-        elif isinstance(A, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A):
-                encoding_lib.Encoding_Double_aggregate_forward(E, A, X, C)
+        if A.is_cuda:
+            E = lib.gpu.aggregate_forward(A, X, C)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return E
     @staticmethod
     def backward(ctx, gradE):
         A, X, C = ctx.saved_variables
-        with torch.cuda.device_of(A):
-            gradA = Variable(A.data.new().resize_as_(A.data))
-            gradX = Variable(A.data.new().resize_as_(X.data))
-            gradC = Variable(A.data.new().resize_as_(C.data))
-        if isinstance(A.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Float_aggregate_backward(gradA.data, \
-                    gradE.data, A.data, X.data, C.data)
-        elif isinstance(A.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(A.data):
-                encoding_lib.Encoding_Double_aggregate_backward(gradA.data, \
-                    gradE.data, A.data, X.data, C.data)
+        if A.is_cuda:
+            gradA, gradX, gradC = lib.gpu.aggregate_backward(gradE, A, X, C)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        gradX.data.copy_(torch.bmm(A, gradE).data)
-        gradC.data.copy_((-gradE*A.sum(1).unsqueeze(2)).sum(0).data)
+            raise NotImplemented
         return gradA, gradX, gradC
 def aggregate(A, X, C):
-    r"""
-    Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
+    r""" Aggregate operation, aggregate the residuals of inputs (:math:`X`) with respect
     to the codewords (:math:`C`) with assignment weights (:math:`A`).
     .. math::
         e_{k} = \sum_{i=1}^{N} a_{ik} (x_i - d_k)
     Shape:
@@ -77,53 +56,31 @@ def aggregate(A, X, C):
         >>> C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5), requires_grad=True)
         >>> func = encoding.aggregate()
         >>> E = func(A, X, C)
     """
     return _aggregate.apply(A, X, C)
 class _scaledL2(Function):
     @staticmethod
     def forward(ctx, X, C, S):
-        B, N, _ = X.size()
-        K = C.size(0)
-        with torch.cuda.device_of(X):
-            SL = X.new(B, N, K)
-        if isinstance(X, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Float_scaledl2_forward(SL, X, C, S)
-        elif isinstance(X, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X):
-                encoding_lib.Encoding_Double_scaledl2_forward(SL, X, C, S)
+        if X.is_cuda:
+            SL = lib.gpu.scaled_l2_forward(X, C, S)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         ctx.save_for_backward(X, C, S, SL)
         return SL
     @staticmethod
     def backward(ctx, gradSL):
         X, C, S, SL = ctx.saved_variables
-        K = C.size(0)
-        with torch.cuda.device_of(X.data):
-            gradX = Variable(X.data.new().resize_as_(X.data))
-            gradC = Variable(X.data.new().resize_as_(C.data))
-            gradS = Variable(X.data.new().resize_as_(S.data))
-        if isinstance(X.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Float_scaledl2_backward(gradSL.data, \
-                    gradX.data, gradC.data, X.data, C.data, S.data)
-        elif isinstance(X.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(X.data):
-                encoding_lib.Encoding_Double_scaledl2_backward(gradSL.data, \
-                    gradX.data, gradC.data, X.data, C.data, S.data)
+        if X.is_cuda:
+            gradX, gradC, gradS = lib.gpu.scaled_l2_backward(gradSL, X, C, S, SL)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        gradS.data.copy_((gradSL*(SL/S.view(1, 1, K))).sum(0).sum(0).data)
+            raise NotImplemented
         return gradX, gradC, gradS
 def scaledL2(X, C, S):
-    r"""
-    scaledL2 distance
+    r""" scaledL2 distance
     .. math::
         sl_{ik} = s_k \|x_i-c_k\|^2
@@ -134,6 +91,5 @@ def scaledL2(X, C, S):
     (where :math:`B` is batch, :math:`N` is total number of features,
      :math:`K` is number of codewords, :math:`D` is feature dimensions.)
     - Output: :math:`E\in\mathcal{R}^{B\times N\times K}`
     """
     return _scaledL2.apply(X, C, S)
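For clarity, a CPU reference sketch (not part of the commit) of the math that the aggregate and scaledL2 GPU kernels implement; shapes follow the docstrings above (A: BxNxK, X: BxNxD, C: KxD, S: K).
import torch
def aggregate_ref(A, X, C):
    # e_k = sum_i a_ik * (x_i - c_k)  ->  (B, K, D)
    return (A.unsqueeze(3) * (X.unsqueeze(2) - C.view(1, 1, *C.size()))).sum(1)
def scaled_l2_ref(X, C, S):
    # sl_ik = s_k * ||x_i - c_k||^2   ->  (B, N, K)
    return S.view(1, 1, -1) * (X.unsqueeze(2) - C.view(1, 1, *C.size())).pow(2).sum(3)
B, N, K, D = 2, 5, 4, 3
A, X = torch.rand(B, N, K), torch.rand(B, N, D)
C, S = torch.rand(K, D), torch.rand(K)
print(aggregate_ref(A, X, C).shape, scaled_l2_ref(X, C, S).shape)  # (2, 4, 3) (2, 5, 4)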
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ## Created by: Hang Zhang
-## ECE Department, Rutgers University
-## Email: zhang.hang@rutgers.edu
-## Copyright (c) 2017
+## Email: zhanghang0704@gmail.com
+## Copyright (c) 2018
 ##
 ## This source code is licensed under the MIT-style license found in the
 ## LICENSE file in the root directory of this source tree
@@ -11,9 +10,9 @@
 """Synchronized Cross-GPU Batch Normalization functions"""
 import torch
 from torch.autograd import Variable, Function
-from .._ext import encoding_lib
+from .. import lib
-__all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
+__all__ = ['sum_square', 'batchnormtrain']
 def sum_square(input):
     r"""Calculate sum of elements and sum of squares for Batch Normalization"""
@@ -24,91 +23,45 @@ class _sum_square(Function):
     @staticmethod
     def forward(ctx, input):
         ctx.save_for_backward(input)
-        C = input.size(1)
-        with torch.cuda.device_of(input):
-            xsum = input.new().resize_(C).zero_()
-            xsquare = input.new().resize_(C).zero_()
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_sum_square_Forward(
-                    input, xsum, xsquare)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_sum_square_Forward(
-                    input, xsum, xsquare)
+        if input.is_cuda:
+            xsum, xsqusum = lib.gpu.sumsquare_forward(input)
         else:
-            raise RuntimeError('Unimplemented data type!', type(input))
-        return xsum, xsquare
+            raise NotImplemented
+        return xsum, xsqusum
     @staticmethod
     def backward(ctx, gradSum, gradSquare):
         input, = ctx.saved_variables
-        with torch.cuda.device_of(input.data):
-            gradInput = Variable(input.data.new().resize_as_(input.data).zero_())
-        if isinstance(input.data, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input.data):
-                encoding_lib.Encoding_Float_sum_square_Backward(
-                    gradInput.data, input.data, gradSum.data, gradSquare.data)
-        elif isinstance(input.data, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input.data):
-                encoding_lib.Encoding_Double_sum_square_Backward(
-                    gradInput.data, input.data, gradSum.data, gradSquare.data)
+        if input.is_cuda:
+            gradInput = lib.gpu.sumsquare_backward(input, gradSum, gradSquare)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return gradInput
-class _batchnorm(Function):
-    def __init__(self, training=False):
-        super(_batchnorm, self).__init__()
-        self.training = training
-    def forward(self, input, gamma, beta, mean, std):
-        self.save_for_backward(input, gamma, beta, mean, std)
-        assert(input.dim() == 3)
-        with torch.cuda.device_of(input):
-            invstd = 1.0 / std
-            output = input.new().resize_as_(input)
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_batchnorm_Forward(output, \
-                    input, mean, invstd, gamma, beta)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_batchnorm_Forward(output, \
-                    input, mean, invstd, gamma, beta)
+class _batchnormtrain(Function):
+    @staticmethod
+    def forward(ctx, input, mean, std, gamma, beta):
+        ctx.save_for_backward(input, mean, std, gamma, beta)
+        if input.is_cuda:
+            output = lib.gpu.batchnorm_forward(input, mean, std, gamma, beta)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise NotImplemented
         return output
-    def backward(self, gradOutput):
-        input, gamma, beta, mean, std = self.saved_tensors
-        invstd = 1.0 / std
-        with torch.cuda.device_of(input):
-            gradInput = gradOutput.new().resize_as_(input).zero_()
-            gradGamma = gradOutput.new().resize_as_(gamma).zero_()
-            gradBeta = gradOutput.new().resize_as_(beta).zero_()
-            gradMean = gradOutput.new().resize_as_(mean).zero_()
-            gradStd = gradOutput.new().resize_as_(std).zero_()
-        if isinstance(input, torch.cuda.FloatTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Float_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta,
-                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    self.training)
-        elif isinstance(input, torch.cuda.DoubleTensor):
-            with torch.cuda.device_of(input):
-                encoding_lib.Encoding_Double_batchnorm_Backward(
-                    gradOutput, input, gradInput, gradGamma, gradBeta,
-                    mean, invstd, gamma, beta, gradMean, gradStd,
-                    self.training)
+    @staticmethod
+    def backward(ctx, gradOutput):
+        input, mean, std, gamma, beta = ctx.saved_variables
+        if gradOutput.is_cuda:
+            gradInput, gradMean, gradStd, gradGamma, gradBeta = \
+                lib.gpu.batchnorm_backward(gradOutput, input, mean,
+                                           std, gamma, beta, True)
         else:
-            raise RuntimeError('Unimplemented data type!')
-        return gradInput, gradGamma, gradBeta, gradMean, gradStd
+            raise NotImplemented
+        return gradInput, gradMean, gradStd, gradGamma, gradBeta
-def batchnormtrain(input, gamma, beta, mean, std):
+def batchnormtrain(input, mean, std, gamma, beta):
     r"""Applies Batch Normalization over a 3d input that is seen as a
     mini-batch.
@@ -123,14 +76,4 @@ def batchnormtrain(input, gamma, beta, mean, std):
     - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
     """
-    return _batchnorm(True)(input, gamma, beta, mean, std)
+    return _batchnormtrain.apply(input, mean, std, gamma, beta)
-def batchnormeval(input, gamma, beta, mean, std):
-    r"""Applies Batch Normalization over a 3d input that is seen as a
-    mini-batch.
-    Please see encoding.batchnormtrain_
-    """
-    return _batchnorm(False)(input, gamma, beta, mean, std)
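A plain-PyTorch sketch (not part of the commit) of what `sum_square` and `batchnormtrain` compute for a 3d (B, C, L) input with per-channel statistics; the GPU kernels fuse these operations, this is only the reference math.
import torch
def sum_square_ref(x):
    # per-channel sum and sum of squares over batch and spatial positions
    return x.sum(dim=(0, 2)), x.pow(2).sum(dim=(0, 2))
def batchnormtrain_ref(x, mean, std, gamma, beta):
    # y = gamma * (x - mean) / std + beta, broadcast over the channel dimension
    x_hat = (x - mean.view(1, -1, 1)) / std.view(1, -1, 1)
    return gamma.view(1, -1, 1) * x_hat + beta.view(1, -1, 1)
x = torch.randn(4, 3, 16)
xsum, xsqu = sum_square_ref(x)
n = x.size(0) * x.size(2)
mean = xsum / n
std = (xsqu / n - mean.pow(2)).sqrt()   # biased variance, as in the usual BN training pass
print(batchnormtrain_ref(x, mean, std, torch.ones(3), torch.zeros(3)).shape)  # (4, 3, 16)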
// The maximum number of threads in a block
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 512;
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
static int getNumThreads(int nElem) {
int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
for (int i = 0; i != 5; ++i) {
if (nElem <= threadSizes[i]) {
return threadSizes[i];
}
}
return MAX_BLOCK_SIZE;
}
__device__ __forceinline__ int getMSB(int val) {
return 31 - __clz(val);
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/device_tensor.h"
#else
template <int Dim>
THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
if (!t) {
return THCDeviceTensor<real, Dim>();
}
int inDim = THCTensor_(nDimension)(state, t);
return toDeviceTensor<real, Dim>(state, t);
/*
if (inDim == Dim) {
return toDeviceTensor<real, Dim>(state, t);
}
// View in which the last dimensions are collapsed or expanded as needed
THAssert(THCTensor_(isContiguous)(state, t));
int size[Dim];
for (int i = 0; i < Dim || i < inDim; ++i) {
if (i < Dim && i < inDim) {
size[i] = t->size[i];
} else if (i < Dim) {
size[i] = 1;
} else {
size[Dim - 1] *= t->size[i];
}
}
return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
*/
}
struct Encoding_(Float2)
/*
* For reduce-sum calculation of two elements
*/
{
real v1, v2;
__device__ Encoding_(Float2)() {}
__device__ Encoding_(Float2)(real x1, real x2) : v1(x1), v2(x2) {}
__device__ Encoding_(Float2)(real v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)(int v) : v1(v), v2(v) {}
__device__ Encoding_(Float2)& operator+=(const Encoding_(Float2)& a)
{
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
static __device__ __forceinline__ real Encoding_(rwarpSum)(real val) {
#if CUDA_VERSION >= 9000
unsigned int mask = 0xffffffff;
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor_sync(mask, val, 1 << i, WARP_SIZE);
}
#else
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor(val, 1 << i, WARP_SIZE);
}
#else
__shared__ real values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
#endif
return val;
}
static __device__ __forceinline__ Encoding_(Float2) Encoding_(warpSum)(
Encoding_(Float2) value)
{
value.v1 = Encoding_(rwarpSum)(value.v1);
value.v2 = Encoding_(rwarpSum)(value.v2);
return value;
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.c"
#else
__global__ void Encoding_(Aggregate_Forward_kernel) (
THCDeviceTensor<real, 3> E,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating forward kernel function
*/
{
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
Encoding_(AggOp) g(A,X,C);
E[b][k][d] = Encoding_(reduce_agg)(g,b,k,d,N);
}
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregating forward the residuals with assignment weights
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, E_, A_, X_, C_);
if (THCTensor_(nDimension)(state, E_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> E = devicetensor<3>(state, E_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// grid: (D, K, B), one block per output element E[b][k][d]
dim3 blocks(C.getSize(1), C.getSize(0), X.getSize(0));
// block threads stride over N and reduce
dim3 threads(getNumThreads(X.getSize(1)));
Encoding_(Aggregate_Forward_kernel)<<<blocks, threads, 0, stream>>>
(E, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(Aggregate_Backward_kernel) (
THCDeviceTensor<real, 3> GA,
THCDeviceTensor<real, 3> GE,
THCDeviceTensor<real, 3> A,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C)
/*
* aggregating backward kernel function
* G (dl/dR), L (dl/dE), A
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
Encoding_(AggBackOp) g(GE,X,C);
GA[b][i][k] = Encoding_(reduce_aggback)(g,b,i,k,D);
}
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_)
/*
* aggregate backward to assignment weights
* G (dl/dR), L (dl/dE), A
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 5, GA_, GE_, A_, X_, C_);
if (THCTensor_(nDimension)(state, GA_) != 3 ||
THCTensor_(nDimension)(state, GE_) != 3 ||
THCTensor_(nDimension)(state, A_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GA = devicetensor<3>(state, GA_);
THCDeviceTensor<real, 3> GE = devicetensor<3>(state, GE_);
THCDeviceTensor<real, 3> A = devicetensor<3>(state, A_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
// grid: (K, N, B), one block per output element GA[b][i][k]
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
// block threads stride over D and reduce
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(Aggregate_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GA, GE, A, X, C);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2_Forward_kernel) (
THCDeviceTensor<real, 3> SL,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
* scaled-L2 forward kernel function
*/
{
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
Encoding_(L2Op) g(X,C);
SL[b][i][k] = S[k] * Encoding_(reduce_sl2)(g,b,i,k,D);
}
void Encoding_(ScaledL2_Forward)(
THCState *state, THCTensor *SL_, THCTensor *X_,
THCTensor *C_, THCTensor *S_)
/*
* computing the scaled L2 distances between inputs and codewords
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, SL_, X_, C_, S_);
if (THCTensor_(nDimension)(state, SL_) != 3 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> SL = devicetensor<3>(state, SL_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(C.getSize(0), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(1)));
Encoding_(ScaledL2_Forward_kernel)<<<blocks, threads, 0, stream>>>
(SL, X, C, S);
THCudaCheck(cudaGetLastError());
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
__global__ void Encoding_(ScaledL2X_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 3> GX,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
i = blockIdx.y;
K = C.getSize(0);
/* main operation */
Encoding_(L2XBackOp) g(GSL,X,C,S);
GX[b][i][d] = Encoding_(reduce_sl2xback)(g,b,i,d,K);
}
__global__ void Encoding_(ScaledL2C_Backward_kernel) (
THCDeviceTensor<real, 3> GSL,
THCDeviceTensor<real, 2> GC,
THCDeviceTensor<real, 3> X,
THCDeviceTensor<real, 2> C,
THCDeviceTensor<real, 1> S)
/*
*/
{
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
Encoding_(L2CBackOp) g(GSL,X,C,S);
GC[k][d] = Encoding_(reduce_sl2cback)(g,k,d,B,N);
}
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_)
/*
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, GSL_, GX_, GC_, X_, C_, S_);
if (THCTensor_(nDimension)(state, GSL_) != 3 ||
THCTensor_(nDimension)(state, GX_) != 3 ||
THCTensor_(nDimension)(state, GC_) != 2 ||
THCTensor_(nDimension)(state, X_) != 3 ||
THCTensor_(nDimension)(state, C_) != 2 ||
THCTensor_(nDimension)(state, S_) != 1)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> GSL = devicetensor<3>(state, GSL_);
THCDeviceTensor<real, 3> GX = devicetensor<3>(state, GX_);
THCDeviceTensor<real, 2> GC = devicetensor<2>(state, GC_);
THCDeviceTensor<real, 3> X = devicetensor<3>(state, X_);
THCDeviceTensor<real, 2> C = devicetensor<2>(state, C_);
THCDeviceTensor<real, 1> S = devicetensor<1>(state, S_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(X.getSize(2), X.getSize(1), X.getSize(0));
dim3 threads(getNumThreads(C.getSize(0)));
Encoding_(ScaledL2X_Backward_kernel)<<<blocks, threads, 0, stream>>>
(GSL, GX, X, C, S);
THCudaCheck(cudaGetLastError());
dim3 blocks2(C.getSize(1), C.getSize(0));
dim3 threads2(getNumThreads(X.getSize(1)));
Encoding_(ScaledL2C_Backward_kernel)<<<blocks2, threads2, 0, stream>>>
(GSL, GC, X, C, S);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_kernel.h"
#else
void Encoding_(Aggregate_Forward)(THCState *state, THCTensor *E_,
THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(Aggregate_Backward)(THCState *state, THCTensor *GA_,
THCTensor *GE_, THCTensor *A_, THCTensor *X_, THCTensor *C_);
void Encoding_(ScaledL2_Forward)( THCState *state, THCTensor *SL_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
void Encoding_(ScaledL2_Backward)(
THCState *state, THCTensor *GSL_, THCTensor *GX_, THCTensor *GC_,
THCTensor *X_, THCTensor *C_, THCTensor *S_);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/encoding_utils.c"
#else
struct Encoding_(AggOp) {
__device__ Encoding_(AggOp)(THCDeviceTensor<real, 3> a,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: A(a), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return A[b][i][k].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> A;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_agg)(
Encoding_(AggOp) op,
int b, int k, int d, int N)
{
real sum = 0;
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(b,x,k,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(AggBackOp) {
__device__ Encoding_(AggBackOp)(THCDeviceTensor<real, 3> ge,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: GE(ge), X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return GE[b][k][d].ldg() * (X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GE;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_aggback)(
Encoding_(AggBackOp) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2Op) {
__device__ Encoding_(L2Op)(THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c)
: X(x), C(c) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
real r = X[b][i][d].ldg() - C[k][d].ldg();
return r * r;
}
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
};
__device__ real Encoding_(reduce_sl2)(
Encoding_(L2Op) op,
int b, int i, int k, int D)
{
real sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
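/*
 * Backward operators for the scaled L2 term SL[b][i][k] = S[k] * ||X[b][i] - C[k]||^2:
 *     dSL/dX[b][i][d] =  2 * S[k] * (X[b][i][d] - C[k][d])
 *     dSL/dC[k][d]    = -2 * S[k] * (X[b][i][d] - C[k][d])
 * Each term is weighted by the incoming gradient GSL[b][i][k]; gradX sums
 * over k (reduce_sl2xback) and gradC sums over b and i (reduce_sl2cback).
 */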
struct Encoding_(L2XBackOp) {
__device__ Encoding_(L2XBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return 2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2xback)(
Encoding_(L2XBackOp) op,
int b, int i, int d, int K)
{
real sum = 0;
for (int x = threadIdx.x; x < K; x += blockDim.x) {
sum += op(b,i,x,d);
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
struct Encoding_(L2CBackOp) {
__device__ Encoding_(L2CBackOp)(
THCDeviceTensor<real, 3> gsl,
THCDeviceTensor<real, 3> x,
THCDeviceTensor<real, 2> c,
THCDeviceTensor<real, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ real operator()(int b, int i, int k, int d)
{
return -2*S[k].ldg() * GSL[b][i][k].ldg() *
(X[b][i][d].ldg()-C[k][d].ldg());
}
THCDeviceTensor<real, 3> GSL;
THCDeviceTensor<real, 3> X;
THCDeviceTensor<real, 2> C;
THCDeviceTensor<real, 1> S;
};
__device__ real Encoding_(reduce_sl2cback)(
Encoding_(L2CBackOp) op,
int k, int d, int B, int N)
{
real sum = 0;
for (int batch = 0; batch < B; ++batch) {
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(batch,x,k,d);
}
}
// sum over NumThreads within a warp
sum = Encoding_(rwarpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ real shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (real) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(rwarpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.c"
#else
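/*
 * Each thread produces one output element Y[b][c][h][w]: its window starts at
 * (h*dH - padH, w*dW - padW), samples the input every dilationH/dilationW
 * pixels, clips to the input bounds, and divides by the number of in-bounds
 * samples (pool_size).
 */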
__global__ void Encoding_(DilatedAvgPool_Forward_kernel) (
THCDeviceTensor<real, 4> X,
THCDeviceTensor<real, 4> Y,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
* dilated avgpool2d forward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = Y.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= Y.getSize(3) || h >= Y.getSize(2)) return;
int hstart = h*dH -padH;
int wstart = w*dW -padW;
int hend = min(hstart + kH*dilationH, X.getSize(2));
int wend = min(wstart + kW*dilationW, X.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum = 0;
for (int th=hstart; th < hend; th+=dilationH) {
for (int tw=wstart; tw < wend; tw+=dilationW) {
sum += X[b][c][th][tw];
}
}
Y[b][c][h][w] = sum / pool_size;
}
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
* dilated avgpool2d forward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, X_, Y_);
if (THCTensor_(nDimension)(state, X_) != 4 ||
THCTensor_(nDimension)(state, Y_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> X = devicetensor<4>(state, X_);
THCDeviceTensor<real, 4> Y = devicetensor<4>(state, Y_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(Y.getSize(3)/16+1, Y.getSize(2)/16+1,
Y.getSize(1)*Y.getSize(0));
Encoding_(DilatedAvgPool_Forward_kernel)<<<blocks, threads, 0, stream>>>
(X, Y, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
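/*
 * Backward pass: each thread handles one input position (b, c, h, w) and
 * accumulates gradY[b][c][ph][pw] / pool_size over every output window that
 * contains it; phstart/phend and pwstart/pwend bound those windows.
 */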
__global__ void Encoding_(DilatedAvgPool_Backward_kernel) (
THCDeviceTensor<real, 4> gradX,
THCDeviceTensor<real, 4> gradY,
int kH, int kW, int dH, int dW,
int padH, int padW, int dilationH, int dilationW
)
/*
 * dilated avgpool2d backward kernel function
*/
{
/* declarations of the variables */
int bc, b, c, w, h, C;
real sum;
/* Get the index and channels */
bc = blockIdx.z;
w = blockIdx.x * blockDim.x + threadIdx.x;
h = blockIdx.y * blockDim.y + threadIdx.y;
C = gradX.getSize(1);
b = bc / C;
c = bc - b*C;
/* boundary check for output */
if (w >= gradX.getSize(3) || h >= gradX.getSize(2)) return;
int phstart = (h + padH < ((kH-1)*dilationH+1)) ? 0 :
(h + padH - ((kH-1)*dilationH+1))/dH + 1;
int pwstart = (w + padW < ((kW-1)*dilationW+1)) ? 0 :
(w + padW - ((kW-1)*dilationW+1))/dW + 1;
int phend = min((h+padH)/dH+1, gradY.getSize(2));
int pwend = min((w+padW)/dW+1, gradY.getSize(3));
sum = 0;
int hstart, wstart, hend, wend, pool_size;
for (int ph=phstart; ph < phend; ++ph) {
for (int pw=pwstart; pw < pwend; ++pw) {
            hstart = ph*dH -padH;
wstart = pw*dW -padW;
hend = min(hstart + kH*dilationH, gradX.getSize(2));
wend = min(wstart + kW*dilationW, gradX.getSize(3));
hstart = max(hstart, 0);
wstart = max(wstart, 0);
pool_size = ((hend - hstart - 1) / dilationH + 1) *
((wend - wstart - 1) / dilationW + 1);
sum += gradY[b][c][ph][pw] / pool_size;
}
}
gradX[b][c][h][w] = sum;
}
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW)
/*
 * dilated avgpool2d backward function
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 2, gradX_, gradY_);
if (THCTensor_(nDimension)(state, gradX_) != 4 ||
THCTensor_(nDimension)(state, gradY_) != 4)
THError("Encoding: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 4> gradX = devicetensor<4>(state, gradX_);
THCDeviceTensor<real, 4> gradY = devicetensor<4>(state, gradY_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 threads(16, 16);
dim3 blocks(gradX.getSize(3)/16+1, gradX.getSize(2)/16+1,
gradX.getSize(1)*gradX.getSize(0));
Encoding_(DilatedAvgPool_Backward_kernel)<<<blocks, threads, 0, stream>>>
(gradX, gradY, kH, kW, dH, dW, padH, padW, dilationH, dilationW);
THCudaCheck(cudaGetLastError());
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/pooling_kernel.h"
#else
void Encoding_(DilatedAvgPool_Forward)(THCState *state,
THCTensor *X_, THCTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
void Encoding_(DilatedAvgPool_Backward)(THCState *state,
THCTensor *gradX_, THCTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.c"
#else
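/*
 * Synchronized batch-norm forward on a flattened (B x C x N) input: one block
 * per channel c, threads stride over the N positions and compute
 *     y = gamma[c] * (x - mean[c]) * invstd[c] + beta[c].
 */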
__global__ void Encoding_(BatchNorm_Forward_kernel) (
THCDeviceTensor<real, 3> output,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta)
{
int c = blockIdx.x;
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
real inp = input[b][c][x].ldg();
output[b][c][x] = gamma[c].ldg() * (inp - mean[c].ldg()) *
invstd[c].ldg() + beta[c].ldg();
}
}
}
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_)
/*
* batch norm forward function
 * assuming the input is already flattened
*/
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 6, output_, input_, mean_, invstd_,
gamma_, beta_);
if (THCTensor_(nDimension)(state, output_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1)
THError("BatchNorm2d forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> output = devicetensor<3>(state, output_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Forward_kernel)<<<blocks, threads, 0, stream>>>(
output, input, mean, invstd, gamma, beta);
THCudaCheck(cudaGetLastError());
}
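/*
 * Backward statistics for synchronized batch-norm. GradOp yields the pair
 * (g, g * (x - mean)); reducing it over the batch and spatial dimensions
 * gives, per channel, the gradient sum (res.v1) and the dot product with the
 * centered input (res.v2), which the backward kernel turns into gradInput,
 * gradGamma, gradBeta and, in training mode, gradMean and gradStd.
 */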
struct Encoding_(GradOp) {
__device__ Encoding_(GradOp)(real m, THCDeviceTensor<real, 3> i, THCDeviceTensor<real, 3> g)
: mean(m), input(i), gradOutput(g) {}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = gradOutput[batch][plane][n].ldg();
real c = input[batch][plane][n].ldg() - mean;
return Encoding_(Float2)(g, g * c);
}
real mean;
THCDeviceTensor<real, 3> input;
THCDeviceTensor<real, 3> gradOutput;
};
// Sum across (batch, b/c/n) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce)(
Encoding_(GradOp) op,
THCDeviceTensor<real, 3> tensor,
int plane)
{
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
__global__ void Encoding_(BatchNorm_Backward_kernel) (
THCDeviceTensor<real, 3> gradoutput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 3> gradinput,
THCDeviceTensor<real, 1> gradgamma,
THCDeviceTensor<real, 1> gradbeta,
THCDeviceTensor<real, 1> mean,
THCDeviceTensor<real, 1> invstd,
THCDeviceTensor<real, 1> gamma,
THCDeviceTensor<real, 1> beta,
THCDeviceTensor<real, 1> gradMean,
THCDeviceTensor<real, 1> gradStd,
int train)
{
/* declarations of the variables */
/* Get the index and channels */
int c = blockIdx.x;
/* main operation */
Encoding_(GradOp) g(mean[c], input, gradoutput);
Encoding_(Float2) res = Encoding_(reduce)(g, gradoutput, c);
real gradOutputSum = res.v1;
real dotP = res.v2;
//real projScale = dotP * norm * invstd[c].ldg() * invstd[c].ldg();
real gradScale = invstd[c].ldg() * gamma[c].ldg();
if (train && threadIdx.x == 0) {
gradMean[c] = - gradOutputSum * gamma[c].ldg() * invstd[c].ldg();
gradStd[c] = - dotP * gamma[c].ldg() * invstd[c].ldg() * invstd[c].ldg();
}
if (gradinput.numElements() > 0) {
for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
gradinput[batch][c][x] = gradoutput[batch][c][x].ldg() * gradScale;
}
}
}
if (gradgamma.numElements() > 0) {
if (threadIdx.x == 0) {
gradgamma[c] += dotP * invstd[c].ldg();
}
}
if (gradbeta.numElements() > 0) {
if (threadIdx.x == 0) {
gradbeta[c] += gradOutputSum;
}
}
}
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train)
/*
* batch norm backward function
 * assuming the input is already flattened
*/
{
/* Check the GPU index and tensor dims*/
    THCTensor_(checkGPU)(state, 9, gradoutput_, input_, gradinput_,
        gradgamma_, gradbeta_, mean_, invstd_, gamma_, beta_);
if (THCTensor_(nDimension)(state, gradoutput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradinput_) != 3 ||
THCTensor_(nDimension)(state, gradgamma_) != 1 ||
THCTensor_(nDimension)(state, gradbeta_) != 1 ||
THCTensor_(nDimension)(state, mean_) != 1 ||
THCTensor_(nDimension)(state, invstd_) != 1 ||
THCTensor_(nDimension)(state, gamma_) != 1 ||
THCTensor_(nDimension)(state, beta_) != 1 ||
THCTensor_(nDimension)(state, gradMean_) != 1 ||
THCTensor_(nDimension)(state, gradStd_) != 1 )
THError("BatchNorm2d backward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradoutput =
devicetensor<3>(state, gradoutput_);
THCDeviceTensor<real, 3> input =
devicetensor<3>(state, input_);
THCDeviceTensor<real, 3> gradinput =
devicetensor<3>(state, gradinput_);
THCDeviceTensor<real, 1> gradgamma =
devicetensor<1>(state, gradgamma_);
THCDeviceTensor<real, 1> gradbeta = devicetensor<1>(state, gradbeta_);
THCDeviceTensor<real, 1> mean = devicetensor<1>(state, mean_);
THCDeviceTensor<real, 1> invstd = devicetensor<1>(state, invstd_);
THCDeviceTensor<real, 1> gamma = devicetensor<1>(state, gamma_);
THCDeviceTensor<real, 1> beta = devicetensor<1>(state, beta_);
THCDeviceTensor<real, 1> gradMean = devicetensor<1>(state, gradMean_);
THCDeviceTensor<real, 1> gradStd = devicetensor<1>(state, gradStd_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(BatchNorm_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradoutput, input, gradinput, gradgamma, gradbeta, mean, invstd,
gamma, beta, gradMean, gradStd, train);
THCudaCheck(cudaGetLastError());
}
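/*
 * Per-channel sum and sum of squares, used to compute the mean and variance
 * for synchronized batch-norm:
 *     sum[c]    = sum_{b,x} input[b][c][x]
 *     square[c] = sum_{b,x} input[b][c][x]^2
 * The matching backward distributes the gradients back to each element as
 *     gradInput = gradSum[c] + 2 * gradSquare[c] * input.
 */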
struct Encoding_(SumOp) {
__device__ Encoding_(SumOp)(THCDeviceTensor<real, 3> i)
: input(i){}
__device__ __forceinline__ Encoding_(Float2) operator()(int batch, int plane, int n) {
real g = input[batch][plane][n].ldg();
return Encoding_(Float2)(g, g * g);
}
real mean;
THCDeviceTensor<real, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise
__device__ Encoding_(Float2) Encoding_(reduce_sum)(Encoding_(SumOp) op, THCDeviceTensor<real, 3> tensor, int plane) {
Encoding_(Float2) sum = (Encoding_(Float2))0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = Encoding_(warpSum)(sum);
// 'transpose', and reduce within warp again
__shared__ Encoding_(Float2) shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (Encoding_(Float2))0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = Encoding_(warpSum)(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
__global__ void Encoding_(Sum_Square_Forward_kernel) (
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> sum,
THCDeviceTensor<real, 1> square)
{
int c = blockIdx.x;
/* main operation */
Encoding_(SumOp) g(input);
Encoding_(Float2) res = Encoding_(reduce_sum)(g, input, c);
real xsum = res.v1;
real xsquare = res.v2;
if (threadIdx.x == 0) {
sum[c] = xsum;
square[c] = xsquare;
}
}
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_)
/*
 * per-channel sum and sum-of-squares forward function
 * assuming the input is already flattened
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 3, input_, sum_, square_);
if (THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, sum_) != 1 ||
THCTensor_(nDimension)(state, square_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> sum = devicetensor<1>(state, sum_);
THCDeviceTensor<real, 1> square = devicetensor<1>(state, square_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Forward_kernel)<<<blocks, threads, 0, stream>>>(
input, sum, square);
THCudaCheck(cudaGetLastError());
}
__global__ void Encoding_(Sum_Square_Backward_kernel) (
THCDeviceTensor<real, 3> gradInput,
THCDeviceTensor<real, 3> input,
THCDeviceTensor<real, 1> gradSum,
THCDeviceTensor<real, 1> gradSquare)
{
int c = blockIdx.x;
/* main operation */
for (int batch = 0; batch < gradInput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradInput.getSize(2); x += blockDim.x)
{
gradInput[batch][c][x] = gradSum[c] + 2 * gradSquare[c] *
input[batch][c][x];
}
}
}
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput_, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_)
/*
 * per-channel sum and sum-of-squares backward function
 */
{
/* Check the GPU index and tensor dims*/
THCTensor_(checkGPU)(state, 4, gradInput_, input_, gradSum_,
gradSquare_);
if (THCTensor_(nDimension)(state, gradInput_) != 3 ||
THCTensor_(nDimension)(state, input_) != 3 ||
THCTensor_(nDimension)(state, gradSum_) != 1 ||
THCTensor_(nDimension)(state, gradSquare_) != 1)
THError("Sum_Square forward: incorrect input dims. \n");
/* Device tensors */
THCDeviceTensor<real, 3> gradInput = devicetensor<3>(state, gradInput_);
THCDeviceTensor<real, 3> input = devicetensor<3>(state, input_);
THCDeviceTensor<real, 1> gradSum = devicetensor<1>(state, gradSum_);
    THCDeviceTensor<real, 1> gradSquare = devicetensor<1>(state, gradSquare_);
/* kernel function */
cudaStream_t stream = THCState_getCurrentStream(state);
dim3 blocks(input.getSize(1));
dim3 threads(getNumThreads(input.getSize(2)));
Encoding_(Sum_Square_Backward_kernel)<<<blocks, threads, 0, stream>>>(
gradInput, input, gradSum, gradSquare);
THCudaCheck(cudaGetLastError());
}
#endif