Unverified commit b872eb8c authored by Hang Zhang, committed by GitHub

ResNeSt plus (#256)

parent 5a1e3fbc
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
from tqdm import tqdm
from PIL import Image, ImageOps, ImageFilter
import torch
import torch.utils.data as data
import torchvision.transforms as transform
class Segmentation(data.Dataset):
BASE_DIR = 'cityscapes'
def __init__(self, data_folder, mode='train', transform=None,
target_transform=None):
self.root = os.path.join(data_folder, self.BASE_DIR)
self.transform = transform
self.target_transform = target_transform
self.mode = mode
self.images, self.masks = get_city_pairs(self.root, mode)
assert (len(self.images) == len(self.masks))
if len(self.images) == 0:
raise RuntimeError("Found 0 images in subfolders of: " + self.root)
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
mask = Image.open(self.masks[index])#.convert("P")
mask = np.array(mask)
# shift every label up by one so the ignore label maps to 0; with a uint8 mask
# the 255 ignore value wraps around to 0 directly, and the explicit check
# below handles wider integer dtypes where it would become 256
mask += 1
mask[mask == 256] = 0
mask = Image.fromarray(mask)
# synchronized transform
if self.mode == 'train':
img, mask = self._sync_transform(img, mask)
elif self.mode == 'val':
img, mask = self._val_sync_transform(img, mask)
else:
raise RuntimeError('unknown mode for dataloader: {}'.format(self.mode))
# general resize, normalize and toTensor
if self.transform is not None:
#print("transform for input")
img = self.transform(img)
if self.target_transform is not None:
#print("transform for label")
mask = self.target_transform(mask)
return img, mask
def __len__(self):
return len(self.images)
def _val_sync_transform(self, img, mask):
"""
synchronized transformation
"""
outsize = 720
short = outsize
w, h = img.size
if w > h:
oh = short
ow = int(1.0 * w * oh / h)
else:
ow = short
oh = int(1.0 * h * ow / w)
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# center crop
w, h = img.size
x1 = int(round((w - outsize) / 2.))
y1 = int(round((h - outsize) / 2.))
img = img.crop((x1, y1, x1+outsize, y1+outsize))
mask = mask.crop((x1, y1, x1+outsize, y1+outsize))
return img, mask
def _sync_transform(self, img, mask):
# random mirror
if random.random() < 0.5:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
base_size = 2048
crop_size = 720
# random scale: the long edge is drawn from [0.5 * base_size, 2.0 * base_size]
long_size = random.randint(int(base_size*0.5), int(base_size*2.0))
w, h = img.size
if h > w:
oh = long_size
ow = int(1.0 * w * oh / h)
short_size = ow
else:
ow = long_size
oh = int(1.0 * h * ow / w)
short_size = oh
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# random rotate -10~10, mask using NN rotate
deg = random.uniform(-10,10)
img = img.rotate(deg, resample=Image.BILINEAR)
mask = mask.rotate(deg, resample=Image.NEAREST)
# pad crop
if short_size < crop_size:
padh = crop_size - oh if oh < crop_size else 0
padw = crop_size - ow if ow < crop_size else 0
img = ImageOps.expand(img, border=(0,0,padw,padh), fill=0)
mask = ImageOps.expand(mask, border=(0,0,padw,padh), fill=0)
# random crop to crop_size x crop_size
w, h = img.size
x1 = random.randint(0, w - crop_size)
y1 = random.randint(0, h - crop_size)
img = img.crop((x1, y1, x1+crop_size, y1+crop_size))
mask = mask.crop((x1, y1, x1+crop_size, y1+crop_size))
# gaussian blur as in PSP ?
if random.random() < 0.5:
img = img.filter(ImageFilter.GaussianBlur(
radius=random.random()))
return img, mask
def get_city_pairs(folder, mode='train'):
img_paths = []
mask_paths = []
if mode=='train':
img_folder = os.path.join(folder, 'leftImg8bit/train_extra')
mask_folder = os.path.join(folder, 'gtCoarse/train_extra')
else:
img_folder = os.path.join(folder, 'leftImg8bit/val')
mask_folder = os.path.join(folder, 'gtFine/val')
for root, directories, files in os.walk(img_folder):
for filename in files:
basename, extension = os.path.splitext(filename)
if filename.endswith(".png"):
imgpath = os.path.join(root, filename)
foldername = os.path.basename(os.path.dirname(imgpath))
# NOTE: this gtCoarse suffix matches the train_extra annotations; the gtFine
# masks used by the val branch above follow a different naming scheme
maskname = filename.replace('leftImg8bit','gtCoarse_trainIds')
maskpath = os.path.join(mask_folder, foldername, maskname)
if os.path.isfile(imgpath) and os.path.isfile(maskpath):
img_paths.append(imgpath)
mask_paths.append(maskpath)
else:
print('cannot find the mask or image:', imgpath, maskpath)
return img_paths, mask_paths
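A minimal usage sketch for the dataset above; the data path, normalization statistics, and the mask-to-tensor transform below are illustrative assumptions rather than part of this file:
# usage sketch only -- path, statistics and target transform are assumptions
import numpy as np
import torch
import torch.utils.data as data
import torchvision.transforms as transform

input_transform = transform.Compose([
    transform.ToTensor(),
    transform.Normalize([.485, .456, .406], [.229, .224, .225])])
# convert the (already label-shifted) PIL mask into a LongTensor
target_transform = transform.Lambda(
    lambda m: torch.from_numpy(np.array(m)).long())

trainset = Segmentation('/path/to/data', mode='train',
                        transform=input_transform,
                        target_transform=target_transform)
trainloader = data.DataLoader(trainset, batch_size=8, shuffle=True,
                              num_workers=4, pin_memory=True)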
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
import torch.utils.data as data
from PIL import Image, ImageOps
import torchvision.transforms as transform
from .dataset import ToLabel
class FolderLoader(data.Dataset):
def __init__(self, root, transform=None):
self.root = root
self.transform = transform
self.images = get_folder_images(root)
if len(self.images) == 0:
raise RuntimeError("Found 0 images in subfolders of: " + self.root)
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
def __len__(self):
return len(self.images)
def get_folder_images(img_folder):
img_paths = []
for filename in os.listdir(img_folder):
if filename.endswith(".jpg"):
imgpath = os.path.join(img_folder, filename)
img_paths.append(imgpath)
return img_paths
class Dataloder():
def __init__(self, args):
# the data augmentation is implemented as part of the dataloader
assert(args.test)
input_transform = transform.Compose([
transform.ToTensor(),
transform.Normalize(args.mean, args.std)])
args.test_batch_size = 1
assert(args.test_folder is not None)
print('loading the data from: {}'.format(args.test_folder))
testset = FolderLoader(args.test_folder, input_transform)
kwargs = {'num_workers': args.workers, 'pin_memory': True} \
if args.cuda else {}
self.trainloader = None
self.testloader = data.DataLoader(testset,
batch_size=args.test_batch_size,
shuffle=False, **kwargs)
def getloader(self):
return self.trainloader, self.testloader
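A minimal sketch of driving the folder loader above from a script; the Namespace fields mirror what Dataloder reads from args, and the folder path and statistics are placeholder assumptions:
# sketch only -- field names follow what Dataloder expects; values are placeholders
from argparse import Namespace

args = Namespace(test=True, test_folder='/path/to/jpeg/folder',
                 mean=[.485, .456, .406], std=[.229, .224, .225],
                 workers=4, cuda=False)
_, testloader = Dataloder(args).getloader()
for image, filename in testloader:
    print(filename[0], tuple(image.shape))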
# created by: Sean Liu
# Amazon Lab 126
from __future__ import print_function
import errno
import hashlib
import os
import sys
import tarfile
import numpy as np
import random
import math
import torch.utils.data as data
import PIL
from PIL import Image, ImageOps
from six.moves import urllib
class Segmentation_HPW18(data.Dataset):
CLASSES = [
'background', 'hat', 'hair', 'sunglasses', 'upper-clothes',
'skirt', 'pants', 'dress', 'belt', 'left-shoe', 'right-shoe',
'face', 'left-leg', 'right-leg', 'left-arm', 'right-arm', 'bag',
'scarf'
]
URL = "/cvdata1/lliuqian/humanParsingDataset"
FILE = "hpw18.tar.gz"
MD5 = ''
BASE_DIR = ''
def __init__(self,
root,
train=True,
transform=None,
target_transform=None,
download=False):
self.root = root
_hpw18_root = os.path.join(self.root, self.BASE_DIR)
_mask_dir = os.path.join(_hpw18_root, 'SegmentationClassAug_256x384')
_image_dir = os.path.join(_hpw18_root, 'JPEGImages_256x384')
self.transform = transform
self.target_transform = target_transform
self.train = train
if download:
self._download()
# train/val/test splits are pre-cut
_splits_dir = _hpw18_root
_split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_train.txt')
if not self.train:
_split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_val.txt')
print("reading from ", _split_f)
self.images = []
self.masks = []
with open(os.path.join(_split_f), "r") as lines:
for line in lines:
s = line.split()
_image = s[0]  # absolute path to the image
_mask = s[1]   # absolute path to the mask
assert os.path.isfile(_image)
assert os.path.isfile(_mask)
self.images.append(_image)
self.masks.append(_mask)
assert (len(self.images) == len(self.masks))
def __getitem__(self, index):
_img = Image.open(self.images[index]).convert('RGB')
_timg = Image.open(self.masks[index])
_target = np.array(_timg, dtype=np.uint8)
_target = Image.fromarray(_target)
# synchronized transform
if self.train:
_img, _target = self._sync_transform( _img, _target)
# general resize, normalize and toTensor
if self.transform is not None:
_img = self.transform(_img)
if self.target_transform is not None:
_target = self.target_transform(_target)
return _img, _target
def __len__(self):
return len(self.images)
def _sync_transform(self, img, mask):
# random rotate -10~10
deg = random.uniform(-10,10)
img = img.rotate(deg)
mask = mask.rotate(deg, PIL.Image.NEAREST)
return img, mask
if __name__ == '__main__':
hpw18 = Segmentation_HPW18('/cvdata1/lliuqian/', train=True)
print(hpw18[0])
print (len(hpw18))
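Because this dataset returns the mask as a PIL image unless a target_transform is supplied, a small sketch of a mask-to-tensor transform (a hypothetical stand-in for whatever label transform the training script actually passes) could look like:
# sketch only -- a stand-in for the training script's label transform
import numpy as np
import torch

class MaskToTensor(object):
    """Convert a PIL label image into a LongTensor of class indices."""
    def __call__(self, mask):
        return torch.from_numpy(np.array(mask, dtype=np.int64))

hpw18_train = Segmentation_HPW18('/cvdata1/lliuqian/', train=True,
                                 target_transform=MaskToTensor())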
...@@ -11,6 +11,9 @@ import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import warnings
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
class ImageNetDataset(datasets.ImageFolder):
BASE_DIR = "ILSVRC2012"
def __init__(self, root=os.path.expanduser('~/.encoding/data'), transform=None,
......
"""Encoding Autograd Fuctions""" """Encoding Autograd Fuctions"""
from .encoding import * from .encoding import *
from .syncbn import * from .syncbn import *
from .dist_syncbn import dist_syncbatchnorm
from .customize import * from .customize import *
from .rectify import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## Email: zhanghang0704@gmail.com
## Copyright (c) 2020
##
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
from torch.autograd.function import Function
from .. import lib
__all__ = ['dist_syncbatchnorm']
class dist_syncbatchnorm_(Function):
@staticmethod
def forward(ctx, x, gamma, beta, running_mean, running_var, eps, momentum, training, process_group):
x = x.contiguous()
ctx.training = training
ctx.momentum = momentum
ctx.eps = eps
ctx.process_group = process_group
if not ctx.training:
_ex, _var = running_mean.contiguous(), running_var.contiguous()
_exs = _var + _ex ** 2
if x.is_cuda:
y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
else:
y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
ctx.save_for_backward(x, _ex, _exs, gamma, beta)
return y
size = x.numel() // x.size(1)
if size == 1:
raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
if x.is_cuda:
_ex, _exs = lib.gpu.expectation_forward(x)
else:
raise NotImplementedError
count = torch.Tensor([1]).to(x.device)
count_all_reduce = torch.distributed.all_reduce(count, group=process_group, async_op=True)
_ex_all_reduce = torch.distributed.all_reduce(_ex, group=process_group, async_op=True)
_exs_all_reduce = torch.distributed.all_reduce(_exs, group=process_group, async_op=True)
count_all_reduce.wait()
_ex_all_reduce.wait()
_exs_all_reduce.wait()
_ex = _ex / count
_exs = _exs / count
# Update running stats
_var = _exs - _ex ** 2
running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * _ex)
running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * _var)
# Mark in-place modified tensors
ctx.mark_dirty(running_mean, running_var)
# BN forward + activation
if x.is_cuda:
y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
else:
y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
ctx.save_for_backward(x, _ex, _exs, gamma, beta)
return y
@staticmethod
def backward(ctx, dz):
x, _ex, _exs, gamma, beta = ctx.saved_tensors
dz = dz.contiguous()
# BN backward
if dz.is_cuda:
dx, _dex, _dexs, dgamma, dbeta = \
lib.gpu.batchnorm_backward(dz, x, _ex, _exs, gamma, beta, ctx.eps)
else:
raise NotImplementedError
if ctx.training:
process_group = ctx.process_group
count = torch.Tensor([1]).to(x.device)
count_all_reduce = torch.distributed.all_reduce(count, group=process_group, async_op=True)
_dex_all_reduce = torch.distributed.all_reduce(_dex, group=process_group, async_op=True)
_dexs_all_reduce = torch.distributed.all_reduce(_dexs, group=process_group, async_op=True)
count_all_reduce.wait()
_dex_all_reduce.wait()
_dexs_all_reduce.wait()
_dex = _dex / count
_dexs = _dexs / count
if x.is_cuda:
dx_ = lib.gpu.expectation_backward(x, _dex, _dexs)
else:
raise NotImplementedError
dx = dx + dx_
return dx, dgamma, dbeta, None, None, None, None, None, None
dist_syncbatchnorm = dist_syncbatchnorm_.apply
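A minimal sketch of wrapping the autograd function above in a module, included only to illustrate its argument order; it assumes an initialized default process group when training and is not the repository's own DistSyncBatchNorm layer:
# sketch only -- illustrates the argument order of dist_syncbatchnorm;
# assumes torch.distributed has already been initialized
import torch
import torch.nn as nn
import torch.distributed as dist

class ToyDistSyncBN(nn.Module):
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super(ToyDistSyncBN, self).__init__()
        self.eps, self.momentum = eps, momentum
        self.weight = nn.Parameter(torch.ones(num_features))
        self.bias = nn.Parameter(torch.zeros(num_features))
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))

    def forward(self, x):
        # statistics are all-reduced across the default process group
        group = dist.group.WORLD
        return dist_syncbatchnorm(x, self.weight, self.bias,
                                  self.running_mean, self.running_var,
                                  self.eps, self.momentum,
                                  self.training, group)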
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## Email: zhanghang0704@gmail.com
## Copyright (c) 2020
##
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Rectify function"""
import torch
from torch.autograd import Function
from .. import lib
__all__ = ['rectify']
class _rectify(Function):
@staticmethod
def forward(ctx, y, x, kernel_size, stride, padding, dilation, average):
ctx.save_for_backward(x)
# expand to the dilated kernel extent; note that k + 2*(d - 1) equals the
# general (k - 1)*d + 1 only when k == 3, so this assumes a 3x3 kernel
kernel_size = [k + 2 * (d - 1) for k, d in zip(kernel_size, dilation)]
ctx.kernel_size = kernel_size
ctx.stride = stride
ctx.padding = padding
ctx.dilation = dilation
ctx.average = average
if x.is_cuda:
lib.gpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
else:
lib.cpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
ctx.mark_dirty(y)
return y
@staticmethod
def backward(ctx, grad_y):
x, = ctx.saved_tensors
if x.is_cuda:
lib.gpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
ctx.padding, ctx.dilation, ctx.average)
else:
lib.cpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
ctx.padding, ctx.dilation, ctx.average)
ctx.mark_dirty(grad_y)
return grad_y, None, None, None, None, None, None
rectify = _rectify.apply
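A sketch of the intended use of rectify: rescale the outputs of a zero-padded convolution whose windows partially overlap the padding, in the spirit of the rectified convolution used by ResNeSt. The wrapper below is an assumption for illustration, not the repository's RFConv2d:
# sketch only -- a hand-rolled wrapper, not the library's RFConv2d
import torch.nn.functional as F

def rectified_conv2d(x, weight, stride=(1, 1), padding=(1, 1), dilation=(1, 1)):
    y = F.conv2d(x, weight, stride=stride, padding=padding, dilation=dilation)
    kernel_size = (weight.size(2), weight.size(3))
    # rectify rescales y in place (average=True) and returns it
    return rectify(y, x, kernel_size, stride, padding, dilation, True)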
...@@ -10,7 +10,7 @@
"""Synchronized Cross-GPU Batch Normalization functions"""
import torch
import torch.cuda.comm as comm
from torch.autograd import Variable, Function
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from .. import lib
......
...@@ -12,6 +12,7 @@ cpu = load('enclib_cpu', [
os.path.join(cpu_path, 'syncbn_cpu.cpp'),
os.path.join(cpu_path, 'roi_align_cpu.cpp'),
os.path.join(cpu_path, 'nms_cpu.cpp'),
os.path.join(cpu_path, 'rectify_cpu.cpp'),
], build_directory=cpu_path, verbose=False)
if torch.cuda.is_available():
...@@ -19,9 +20,9 @@ if torch.cuda.is_available():
os.path.join(gpu_path, 'operator.cpp'),
os.path.join(gpu_path, 'activation_kernel.cu'),
os.path.join(gpu_path, 'encoding_kernel.cu'),
os.path.join(gpu_path, 'encodingv2_kernel.cu'),
os.path.join(gpu_path, 'syncbn_kernel.cu'),
os.path.join(gpu_path, 'roi_align_kernel.cu'),
os.path.join(gpu_path, 'nms_kernel.cu'),
os.path.join(gpu_path, 'rectify_cuda.cu'),
], extra_cuda_cflags=["--expt-extended-lambda"],
build_directory=gpu_path, verbose=False)
...@@ -12,4 +12,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("sumsquare_forward", &Sum_Square_Forward_CPU, "SumSqu forward (CPU)");
m.def("sumsquare_backward", &Sum_Square_Backward_CPU, "SumSqu backward (CPU)");
m.def("non_max_suppression", &Non_Max_Suppression_CPU, "NMS (CPU)");
m.def("conv_rectify", &CONV_RECTIFY_CPU, "Convolution Rectifier (CPU)");
}
...@@ -72,3 +72,12 @@ std::vector<at::Tensor> Non_Max_Suppression_CPU(
const at::Tensor& input,
const at::Tensor& scores,
double thresh);
void CONV_RECTIFY_CPU(
at::Tensor& output,
const at::Tensor& input,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool avg_mode);
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <tuple>
#include <torch/extension.h>
#include <ATen/div_rtn.h>
#include <ATen/TensorUtils.h>
#include <ATen/AccumulateType.h>
template <typename dest_t, typename src_t>
static inline dest_t safe_downcast(src_t v)
{
TORCH_CHECK(std::numeric_limits<dest_t>::min() <= v && v <= std::numeric_limits<dest_t>::max(),
"integer out of range");
return static_cast<dest_t>(v);
}
template<typename T>
static inline T pooling_output_shape_pad_lr(
T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation,
bool ceil_mode) {
T outputSize = div_rtn<T>(
inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 +
(ceil_mode ? stride - 1 : 0), stride) + 1;
if (pad_l) {
// ensure that the last pooling starts inside the image
// needed to avoid problems in ceil mode
if ((outputSize - 1) * stride >= inputSize + pad_l)
--outputSize;
}
return outputSize;
}
template<typename T>
static inline T pooling_output_shape(
T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) {
return pooling_output_shape_pad_lr(
inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
}
static inline void pool2d_shape_check(
const at::Tensor& input,
int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW,
int64_t nInputPlane,
int64_t inputHeight, int64_t inputWidth,
int64_t outputHeight, int64_t outputWidth)
{
const int64_t ndim = input.ndimension();
const int64_t nOutputPlane = nInputPlane;
TORCH_CHECK(kW > 0 && kH > 0,
"kernel size should be greater than zero, but got ",
"kH: ", kH, " kW: ", kW);
TORCH_CHECK(dW > 0 && dH > 0,
"stride should be greater than zero, but got "
"dH: ", dH, " dW: ", dW);
TORCH_CHECK(dilationH > 0 && dilationW > 0,
"dilation should be greater than zero, but got ",
"dilationH: ", dilationH, " dilationW: ", dilationW);
TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4),
"non-empty 3D or 4D input tensor expected but got ndim: ", ndim);
//TORCH_CHECK(kW/2 >= padW && kH/2 >= padH,
// "pad should be smaller than half of kernel size, but got ",
// "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH);
TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
"Given input size: (",
nInputPlane, "x", inputHeight, "x", inputWidth, "). ",
"Calculated output size: (",
nOutputPlane, "x", outputHeight, "x", outputWidth, "). ",
"Output size is too small");
}
template <typename scalar_t>
static void conv_rectify_cpu_frame(
scalar_t *output_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
const int dilation_h,
const int dilation_w,
bool average_mode) {
//at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
for (int64_t k = 0; k < nInputPlane; k++) {
int64_t p;
for(p = 0; p < nbatch; p++)
{
int64_t xx, yy;
/* For all output pixels... */
scalar_t *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
//int64_t i;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
/* Compute the mean of the input image... */
int64_t hstart = yy * dH - padH;
int64_t wstart = xx * dW - padW;
int64_t hend = std::min(hstart + kH, inputHeight + padH);
int64_t wend = std::min(wstart + kW, inputWidth + padW);
//int pool_size = (hend - hstart) * (wend - wstart);
int pool_size = ((kH - 1) / dilation_h + 1) * ((kW - 1) / dilation_w + 1);
hstart = std::max(hstart, (int64_t) 0);
wstart = std::max(wstart, (int64_t) 0);
hend = std::min(hend, inputHeight);
wend = std::min(wend, inputWidth);
int hcount = int(((hend - hstart) - 1) / dilation_h + 1);
int wcount = int(((wend - wstart) - 1) / dilation_w + 1);
scalar_t mul_factor;
if (average_mode) {
mul_factor = scalar_t(1.0) / (hcount * wcount);
}
else {
mul_factor = scalar_t(1.0) * pool_size / (hcount * wcount);
}
*ptr_output++ *= mul_factor;
}
}
}
}
//});
}
void conv_rectify_cpu_template(
at::Tensor &output,
const at::Tensor &input_,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool average_mode)
{
// #20866, #22032: Guarantee this for the official C++ API?
TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
"conv_rectify: kernel_size must either be a single int, or a tuple of two ints");
const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
"conv_rectify: stride must either be omitted, a single int, or a tuple of two ints");
const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
const int dW = stride.empty() ? kW :
stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
"conv_rectify: padding must either be a single int, or a tuple of two ints");
const int padH = safe_downcast<int, int64_t>(padding[0]);
const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
"rectify: dilation must either be a single int, or a tuple of two ints");
const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
"non-empty 2D or 3D (batch mode) tensor expected for input");
/* sizes */
const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
const int64_t nInputPlane = input_.size(-3);
const int64_t inputHeight = input_.size(-2);
const int64_t inputWidth = input_.size(-1);
//const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, false);
//const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, false);
const int64_t outputHeight = output.size(-2);
const int64_t outputWidth = output.size(-1);
pool2d_shape_check(
input_,
kH, kW, dH, dW, padH, padW, dilationH, dilationW,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth);
TORCH_CHECK(output.is_contiguous(), "conv_rectify: output must be contiguous");
at::Tensor input = input_.contiguous();
AT_DISPATCH_FLOATING_TYPES(input.type(), "conv_rectify_cpu_frame", ([&] {
scalar_t *output_data = output.data_ptr<scalar_t>();
conv_rectify_cpu_frame<scalar_t>(
output_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH,
dW, dH,
padW, padH,
dilationH,
dilationW,
average_mode);
}
));
}
void CONV_RECTIFY_CPU(
at::Tensor& output,
const at::Tensor& input,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool average) {
//at::Tensor output = at::empty({0}, input.options());
conv_rectify_cpu_template(
output,
input,
kernel_size,
stride,
padding,
dilation,
average);
}
#include <vector>
#include <torch/extension.h>
#include <ATen/ATen.h>
// #include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <cuda_runtime_api.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include "common.h"
namespace {
......
#include <vector>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/Functions.h>
#include <ATen/cuda/CUDAContext.h>
#include "common.h"
#include "device_tensor.h"
namespace {
template<typename DType, typename Acctype>
struct KD2Op {
__device__ KD2Op(DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d)
{
DType r = (X[b][i][d] - C[k][d]) / STD[k][d];
return ScalarConvert<DType, Acctype>::to(r * r);
}
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void Encoding_Dist_Forward_kernel (
DeviceTensor<DType, 3> KD,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
KD2Op<DType, Acctype> g(X, C, STD);
KD[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
template<typename DType, typename Acctype>
struct EncGradXOp {
__device__ EncGradXOp(
DeviceTensor<DType, 3> gkd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
// DeviceTensor<DType, 1> s, S(s)
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) /
(STD[k][d] * STD[k][d]));
}
DeviceTensor<DType, 3> GKD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
// DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradX_kernel (
DeviceTensor<DType, 3> GKD,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
// DeviceTensor<DType, 1> S
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
d = blockIdx.x;
K = C.getSize(0);
/* main operation */
EncGradXOp<DType, Acctype> g(GKD, X, C, STD);
GX[b][i][d] = reduceK<Acctype>(g, b, i, d, K);
}
template<typename DType, typename Acctype>
struct EncGradSTDOp {
__device__ EncGradSTDOp(
DeviceTensor<DType, 3> gkd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
// DeviceTensor<DType, 1> s, S(s)
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
-2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) *
(X[b][i][d] - C[k][d]) / (STD[k][d] * STD[k][d] * STD[k][d]));
}
DeviceTensor<DType, 3> GKD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
// DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradCSTD_kernel (
DeviceTensor<DType, 3> GKD,
DeviceTensor<DType, 2> GC,
DeviceTensor<DType, 2> GSTD,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
EncGradXOp<DType, Acctype> g1(GKD, X, C, STD);
EncGradSTDOp<DType, Acctype> g2(GKD, X, C, STD);
GC[k][d] = -reduceBN<Acctype>(g1, k, d, B, N);
GSTD[k][d] += reduceBN<Acctype>(g2, k, d, B, N);
}
template<typename DType, typename Acctype>
struct EncGradSTDXOp {
__device__ EncGradSTDXOp(
DeviceTensor<DType, 2> gstd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GSTD(gstd), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
GSTD[k][d] * (X[b][i][d] - C[k][d]) / STD[k][d]);
}
DeviceTensor<DType, 2> GSTD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradSTDX_kernel (
DeviceTensor<DType, 2> GSTD,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD,
int N) {
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
d = blockIdx.x;
K = C.getSize(0);
/* main operation */
EncGradSTDXOp<DType, Acctype> g(GSTD, X, C, STD);
GX[b][i][d] += reduceK<Acctype>(g, b, i, d, K) / N;
}
template<typename DType, typename Acctype>
struct AggOpV2 {
__device__ AggOpV2(DeviceTensor<DType, 3> a,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : A(a), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(A[b][i][k] * (X[b][i][d] - C[k][d]) /
STD[k][d]);
}
DeviceTensor<DType, 3> A;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void AggregateV2_Forward_kernel (
DeviceTensor<DType, 3> E,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
AggOpV2<DType, Acctype> g(A, X, C, STD);
E[b][k][d] = reduceN<Acctype>(g, b, k, d, N);
}
template<typename DType, typename Acctype>
struct AggV2BackOp {
__device__ AggV2BackOp(DeviceTensor<DType, 3> g,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : G(g), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(G[b][k][d] * (X[b][i][d] - C[k][d]) /
STD[k][d]);
}
DeviceTensor<DType, 3> G;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void AggregateV2_Backward_kernel (
DeviceTensor<DType, 3> GA,
DeviceTensor<DType, 3> GE,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
AggV2BackOp<DType, Acctype> g(GE, X, C, STD);
GA[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
} // namespace
at::Tensor Encoding_Dist_Inference_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
// calculate the kernel distance
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Inference_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (KD, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return KD_;
}
std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
auto GSTD_ = at::zeros_like(STD_);
/* kernel function */
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
int N = X_.size(0) * X_.size(1);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
Encoding_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GKD, GX, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradCSTD_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GKD, GC, GSTD, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
return {GX_, GC_, GSTD_};
}
std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
double eps) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
auto SVar_ = (X_.pow(2).sum(0).sum(0).view({1, X_.size(2)}) -
2 * C_ * X_.sum(0).sum(0).view({1, X_.size(2)})).expand_as(C_) +
C_.pow(2) * N;
auto STD_ = at::sqrt(SVar_ / N + eps);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
// calculate the kernel distance
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (KD, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {KD_, STD_, SVar_ / (N - 1)};
}
std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor GSTD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
auto GSTD2_ = GSTD_.clone();
/* kernel function */
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
int N = X_.size(0) * X_.size(1);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD2_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
Encoding_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GKD, GX, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradCSTD_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GKD, GC, GSTD, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradSTDX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GSTD, GX, X, C, STD, N);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
// d_sigma/d_c
GC_ = GC_ - GSTD2_ * (X_.mean(0).mean(0) - C_) / STD_;
return {GX_, GC_};
}
at::Tensor AggregateV2_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
/* Device tensors */
auto E_ = torch::zeros({A_.size(0), C_.size(0), C_.size(1)}, A_.options());
// auto IS_ = 1.0f / (S_ + eps).sqrt();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
dim3 threads(getNumThreads(X_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Forward_CUDA", ([&] {
DeviceTensor<scalar_t, 3> E = devicetensor<scalar_t, 3>(E_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
AggregateV2_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>>(E, A, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return E_;
}
std::vector<at::Tensor> AggregateV2_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor E_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto gradA_ = at::zeros_like(A_);
auto gradX_ = at::bmm(A_ , (GE_ / STD_.unsqueeze(0)));
auto gradC_ = -(A_.sum(1).unsqueeze(2) * GE_ / STD_.unsqueeze(0)).sum(0);
auto gradSTD_ = -(GE_ * E_).sum(0) / STD_;
// auto gradS_ = -0.5 * (GE_ * E_).sum(2).sum(0) / S_;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GA = devicetensor<scalar_t, 3>(gradA_);
DeviceTensor<scalar_t, 3> GE = devicetensor<scalar_t, 3>(GE_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
AggregateV2_Backward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (GA, GE, A, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {gradA_, gradX_, gradC_, gradSTD_};
}
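For reference, the math implemented by the Encoding_Dist and AggregateV2 kernels above can be expressed in a few lines of plain PyTorch; the sketch below is only meant for checking shapes and semantics, not as a replacement for the CUDA path:
# reference sketch only -- mirrors the kernels above with plain tensor ops
import torch

def encoding_dist_forward_ref(X, C, eps):
    # X: (B, N, D) descriptors, C: (K, D) codewords
    B, N, _ = X.shape
    diff = X.unsqueeze(2) - C.unsqueeze(0).unsqueeze(0)   # (B, N, K, D)
    svar = diff.pow(2).sum(dim=(0, 1))                    # (K, D)
    std = torch.sqrt(svar / (B * N) + eps)                # per-codeword scale
    kd = (diff / std).pow(2).sum(dim=-1)                  # (B, N, K)
    return kd, std, svar / (B * N - 1)

def aggregate_v2_forward_ref(A, X, C, STD):
    # A: (B, N, K) assignment weights; returns E: (B, K, D)
    scaled = (X.unsqueeze(2) - C.unsqueeze(0).unsqueeze(0)) / STD
    return torch.einsum('bnk,bnkd->bkd', A, scaled)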
...@@ -16,14 +16,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("expectation_backward", &Expectation_Backward_CUDA, "Expectation backward (CUDA)");
m.def("expectation_inp_backward", &Expectation_Inp_Backward_CUDA,
"Inplace Expectation backward (CUDA)");
m.def("encoding_dist_forward", &Encoding_Dist_Forward_CUDA, "EncDist forward (CUDA)");
m.def("encoding_dist_backward", &Encoding_Dist_Backward_CUDA, "Assign backward (CUDA)");
m.def("encoding_dist_inference_forward", &Encoding_Dist_Inference_Forward_CUDA,
"EncDist Inference forward (CUDA)");
m.def("encoding_dist_inference_backward", &Encoding_Dist_Inference_Backward_CUDA,
"Assign Inference backward (CUDA)");
m.def("aggregatev2_forward", &AggregateV2_Forward_CUDA, "AggregateV2 forward (CUDA)");
m.def("aggregatev2_backward", &AggregateV2_Backward_CUDA, "AggregateV2 backward (CUDA)");
m.def("leaky_relu_forward", &LeakyRelu_Forward_CUDA, "Leaky ReLU forward (CUDA)");
m.def("leaky_relu_backward", &LeakyRelu_Backward_CUDA, "Leaky ReLU backward (CUDA)");
m.def("conv_rectify", &CONV_RECTIFY_CUDA, "Convolution Rectifier (CUDA)");
}
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <vector>
at::Tensor ROIAlign_Forward_CUDA(
...@@ -102,45 +103,15 @@ at::Tensor Expectation_Inp_Backward_CUDA(
const at::Tensor beta_,
float eps);
at::Tensor Encoding_Dist_Inference_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
const at::Tensor X,
const at::Tensor C,
double eps);
std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor GSTD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
at::Tensor AggregateV2_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> AggregateV2_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor E_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
void LeakyRelu_Forward_CUDA(at::Tensor z, float slope);
void LeakyRelu_Backward_CUDA(at::Tensor z, at::Tensor dz, float slope);
void CONV_RECTIFY_CUDA(
at::Tensor& output,
const at::Tensor& input,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool avg_mode);
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/div_rtn.h>
#include <ATen/Dispatch.h>
#include <ATen/TensorUtils.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/KernelUtils.h>
template <typename dest_t, typename src_t>
static inline dest_t safe_downcast(src_t v)
{
TORCH_CHECK(std::numeric_limits<dest_t>::min() <= v && v <= std::numeric_limits<dest_t>::max(),
"integer out of range");
return static_cast<dest_t>(v);
}
template<typename T>
static inline T pooling_output_shape_pad_lr(
T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation,
bool ceil_mode) {
T outputSize = div_rtn<T>(
inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 +
(ceil_mode ? stride - 1 : 0), stride) + 1;
if (pad_l) {
// ensure that the last pooling starts inside the image
// needed to avoid problems in ceil mode
if ((outputSize - 1) * stride >= inputSize + pad_l)
--outputSize;
}
return outputSize;
}
template<typename T>
static inline T pooling_output_shape(
T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) {
return pooling_output_shape_pad_lr(
inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
}
static inline void pool2d_shape_check(
const at::Tensor& input,
int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW,
int64_t nInputPlane,
int64_t inputHeight, int64_t inputWidth,
int64_t outputHeight, int64_t outputWidth)
{
const int64_t ndim = input.ndimension();
const int64_t nOutputPlane = nInputPlane;
TORCH_CHECK(kW > 0 && kH > 0,
"kernel size should be greater than zero, but got ",
"kH: ", kH, " kW: ", kW);
TORCH_CHECK(dW > 0 && dH > 0,
"stride should be greater than zero, but got "
"dH: ", dH, " dW: ", dW);
TORCH_CHECK(dilationH > 0 && dilationW > 0,
"dilation should be greater than zero, but got ",
"dilationH: ", dilationH, " dilationW: ", dilationW);
TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4),
"non-empty 3D or 4D input tensor expected but got ndim: ", ndim);
//TORCH_CHECK(kW/2 >= padW && kH/2 >= padH,
// "pad should be smaller than half of kernel size, but got ",
// "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH);
TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
"Given input size: (",
nInputPlane, "x", inputHeight, "x", inputWidth, "). ",
"Calculated output size: (",
nOutputPlane, "x", outputHeight, "x", outputWidth, "). ",
"Output size is too small");
}
template <typename scalar_t, typename accscalar_t>
__global__ void conv_rectify_cuda_frame(
const int nthreads,
//const scalar_t* const bottom_data,
const int num, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w,
scalar_t* const top_data,
bool average_mode) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int pw = index % pooled_width;
const int ph = (index / pooled_width) % pooled_height;
//const int c = (index / pooled_width / pooled_height) % channels;
//const int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
const int pool_size = ((kernel_h - 1) / dilation_h + 1) * ((kernel_w - 1) / dilation_w + 1);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
accscalar_t mul_factor;
int hcount = int(((hend - hstart) - 1) / dilation_h + 1);
int wcount = int(((wend - wstart) - 1) / dilation_w + 1);
if (average_mode) {
mul_factor = accscalar_t(1.0) / (hcount * wcount);
}
else {
mul_factor = accscalar_t(1.0) * pool_size / (hcount * wcount);
}
top_data[index] = ScalarConvert<accscalar_t, scalar_t>::to(top_data[index] * mul_factor);
}
}
void conv_rectify_cuda_template(
at::Tensor& output,
const at::Tensor& input_,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool average)
{
//at::TensorArg output_arg{ output, "output", 1 };
//at::TensorArg input_arg{ input_, "input_", 2 };
//checkAllSameGPU("rectify_out_cuda", {output_arg, input_arg});
// #20866, #22032: Guarantee this for the official C++ API?
TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
"rectify: kernel_size must either be a single int, or a tuple of two ints");
const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
"rectify: stride must either be omitted, a single int, or a tuple of two ints");
const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
const int dW = stride.empty() ? kW :
stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
"rectify: padding must either be a single int, or a tuple of two ints");
const int padH = safe_downcast<int, int64_t>(padding[0]);
const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
"rectify: dilation must either be a single int, or a tuple of two ints");
const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
const int64_t nInputPlane = input_.size(-3);
const int64_t inputHeight = input_.size(-2);
const int64_t inputWidth = input_.size(-1);
//const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, false);
//const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, false);
const int64_t outputHeight = output.size(-2);
const int64_t outputWidth = output.size(-1);
pool2d_shape_check(
input_,
kH, kW, dH, dW, padH, padW, dilationH, dilationW,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth);
at::Tensor input = input_.contiguous();
//output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
const int32_t count = safe_downcast<int32_t, int64_t>(output.numel());
const uint32_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
const uint32_t num_blocks = at::cuda::ATenCeilDiv<uint32_t>(count, num_threads);
AT_DISPATCH_FLOATING_TYPES(input.type(), "conv_rectify_cuda_frame", ([&] {
//using accscalar_t = acc_type<scalar_t, true>;
scalar_t *output_data = output.data_ptr<scalar_t>();
conv_rectify_cuda_frame<scalar_t, scalar_t>
<<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
nbatch,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth,
kH, kW,
dH, dW,
padH, padW,
dilationH, dilationW,
output_data,
average);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void CONV_RECTIFY_CUDA(
at::Tensor& output,
const at::Tensor& input,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool average) {
//at::Tensor output = at::empty({0}, input.options());
conv_rectify_cuda_template(
output,
input,
kernel_size,
stride,
padding,
dilation,
average);
}
...@@ -8,10 +8,10 @@ setup(
'operator.cpp',
'activation_kernel.cu',
'encoding_kernel.cu',
'encodingv2_kernel.cu',
'syncbn_kernel.cu',
'roi_align_kernel.cu',
'nms_kernel.cu',
'rectify.cu',
]),
],
cmdclass={
......
#include <vector>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include "common.h"
#include "device_tensor.h"
......
from .model_zoo import get_model
from .model_store import get_model_file
from .resnet import *
from .cifarresnet import *
from .base import *
from .fcn import *
from .psp import *
from .encnet import *
from .deeplab import *
from .sseg import get_segmentation_model, MultiEvalModule
def get_segmentation_model(name, **kwargs):
from .fcn import get_fcn
models = {
'fcn': get_fcn,
'psp': get_psp,
'atten': get_atten,
'encnet': get_encnet,
'encnetv2': get_encnetv2,
'deeplab': get_deeplab,
}
return models[name.lower()](**kwargs)