Unverified commit b872eb8c authored by Hang Zhang, committed by GitHub

ResNeSt plus (#256)

parent 5a1e3fbc
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
from tqdm import tqdm
from PIL import Image, ImageOps, ImageFilter
import torch
import torch.utils.data as data
import torchvision.transforms as transform
class Segmentation(data.Dataset):
BASE_DIR = 'cityscapes'
def __init__(self, data_folder, mode='train', transform=None,
target_transform=None):
self.root = os.path.join(data_folder, self.BASE_DIR)
self.transform = transform
self.target_transform = target_transform
self.mode = mode
self.images, self.masks = get_city_pairs(self.root, mode)
assert (len(self.images) == len(self.masks))
if len(self.images) == 0:
raise RuntimeError("Found 0 images in subfolders of: " + self.root)
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.mode == 'test':
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
mask = Image.open(self.masks[index])#.convert("P")
mask = np.array(mask)
# shift every label up by one so the ignore label maps to 0; with a uint8 mask
# the 255 ignore value wraps around to 0 directly, and the explicit check
# below handles wider integer dtypes where it would become 256
mask += 1
mask[mask == 256] = 0
mask = Image.fromarray(mask)
# synchronized transform
if self.mode == 'train':
img, mask = self._sync_transform(img, mask)
elif self.mode == 'val':
img, mask = self._val_sync_transform(img, mask)
else:
raise RuntimeError('unknown mode for dataloader: {}'.format(self.mode))
# general resize, normalize and toTensor
if self.transform is not None:
#print("transform for input")
img = self.transform(img)
if self.target_transform is not None:
#print("transform for label")
mask = self.target_transform(mask)
return img, mask
def __len__(self):
return len(self.images)
def _val_sync_transform(self, img, mask):
"""
synchronized transformation
"""
outsize = 720
short = outsize
w, h = img.size
if w > h:
oh = short
ow = int(1.0 * w * oh / h)
else:
ow = short
oh = int(1.0 * h * ow / w)
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# center crop
w, h = img.size
x1 = int(round((w - outsize) / 2.))
y1 = int(round((h - outsize) / 2.))
img = img.crop((x1, y1, x1+outsize, y1+outsize))
mask = mask.crop((x1, y1, x1+outsize, y1+outsize))
return img, mask
def _sync_transform(self, img, mask):
# random mirror
if random.random() < 0.5:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
base_size = 2048
crop_size = 720
# random scale: the long edge is drawn from [0.5 * base_size, 2.0 * base_size]
long_size = random.randint(int(base_size*0.5), int(base_size*2.0))
w, h = img.size
if h > w:
oh = long_size
ow = int(1.0 * w * oh / h)
short_size = ow
else:
ow = long_size
oh = int(1.0 * h * ow / w)
short_size = oh
img = img.resize((ow, oh), Image.BILINEAR)
mask = mask.resize((ow, oh), Image.NEAREST)
# random rotate -10~10, mask using NN rotate
deg = random.uniform(-10,10)
img = img.rotate(deg, resample=Image.BILINEAR)
mask = mask.rotate(deg, resample=Image.NEAREST)
# pad crop
if short_size < crop_size:
padh = crop_size - oh if oh < crop_size else 0
padw = crop_size - ow if ow < crop_size else 0
img = ImageOps.expand(img, border=(0,0,padw,padh), fill=0)
mask = ImageOps.expand(mask, border=(0,0,padw,padh), fill=0)
# random crop to crop_size x crop_size
w, h = img.size
x1 = random.randint(0, w - crop_size)
y1 = random.randint(0, h - crop_size)
img = img.crop((x1, y1, x1+crop_size, y1+crop_size))
mask = mask.crop((x1, y1, x1+crop_size, y1+crop_size))
# gaussian blur as in PSP ?
if random.random() < 0.5:
img = img.filter(ImageFilter.GaussianBlur(
radius=random.random()))
return img, mask
def get_city_pairs(folder, mode='train'):
img_paths = []
mask_paths = []
if mode=='train':
img_folder = os.path.join(folder, 'leftImg8bit/train_extra')
mask_folder = os.path.join(folder, 'gtCoarse/train_extra')
else:
img_folder = os.path.join(folder, 'leftImg8bit/val')
mask_folder = os.path.join(folder, 'gtFine/val')
for root, directories, files in os.walk(img_folder):
for filename in files:
basename, extension = os.path.splitext(filename)
if filename.endswith(".png"):
imgpath = os.path.join(root, filename)
foldername = os.path.basename(os.path.dirname(imgpath))
# NOTE: this gtCoarse suffix matches the train_extra annotations; the gtFine
# masks used by the val branch above follow a different naming scheme
maskname = filename.replace('leftImg8bit','gtCoarse_trainIds')
maskpath = os.path.join(mask_folder, foldername, maskname)
if os.path.isfile(imgpath) and os.path.isfile(maskpath):
img_paths.append(imgpath)
mask_paths.append(maskpath)
else:
print('cannot find the mask or image:', imgpath, maskpath)
return img_paths, mask_paths
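A minimal usage sketch for the dataset above; the data path, normalization statistics, and the mask-to-tensor transform below are illustrative assumptions rather than part of this file:
# usage sketch only -- path, statistics and target transform are assumptions
import numpy as np
import torch
import torch.utils.data as data
import torchvision.transforms as transform

input_transform = transform.Compose([
    transform.ToTensor(),
    transform.Normalize([.485, .456, .406], [.229, .224, .225])])
# convert the (already label-shifted) PIL mask into a LongTensor
target_transform = transform.Lambda(
    lambda m: torch.from_numpy(np.array(m)).long())

trainset = Segmentation('/path/to/data', mode='train',
                        transform=input_transform,
                        target_transform=target_transform)
trainloader = data.DataLoader(trainset, batch_size=8, shuffle=True,
                              num_workers=4, pin_memory=True)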
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import sys
import numpy as np
import random
import math
import torch.utils.data as data
from PIL import Image, ImageOps
import torchvision.transforms as transform
from .dataset import ToLabel
class FolderLoader(data.Dataset):
def __init__(self, root, transform=None):
self.root = root
self.transform = transform
self.images = get_folder_images(root)
if len(self.images) == 0:
raise RuntimeError("Found 0 images in subfolders of: " + self.root)
def __getitem__(self, index):
img = Image.open(self.images[index]).convert('RGB')
if self.transform is not None:
img = self.transform(img)
return img, os.path.basename(self.images[index])
def __len__(self):
return len(self.images)
def get_folder_images(img_folder):
img_paths = []
for filename in os.listdir(img_folder):
if filename.endswith(".jpg"):
imgpath = os.path.join(img_folder, filename)
img_paths.append(imgpath)
return img_paths
class Dataloder():
def __init__(self, args):
# the data augmentation is implemented as part of the dataloader
assert(args.test)
input_transform = transform.Compose([
transform.ToTensor(),
transform.Normalize(args.mean, args.std)])
args.test_batch_size = 1
assert(args.test_folder is not None)
print('loading the data from: {}'.format(args.test_folder))
testset = FolderLoader(args.test_folder, input_transform)
kwargs = {'num_workers': args.workers, 'pin_memory': True} \
if args.cuda else {}
self.trainloader = None
self.testloader = data.DataLoader(testset,
batch_size=args.test_batch_size,
shuffle=False, **kwargs)
def getloader(self):
return self.trainloader, self.testloader
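A minimal sketch of driving the folder loader above from a script; the Namespace fields mirror what Dataloder reads from args, and the folder path and statistics are placeholder assumptions:
# sketch only -- field names follow what Dataloder expects; values are placeholders
from argparse import Namespace

args = Namespace(test=True, test_folder='/path/to/jpeg/folder',
                 mean=[.485, .456, .406], std=[.229, .224, .225],
                 workers=4, cuda=False)
_, testloader = Dataloder(args).getloader()
for image, filename in testloader:
    print(filename[0], tuple(image.shape))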
# created by: Sean Liu
# Amazon Lab 126
from __future__ import print_function
import errno
import hashlib
import os
import sys
import tarfile
import numpy as np
import random
import math
import torch.utils.data as data
import PIL
from PIL import Image, ImageOps
from six.moves import urllib
class Segmentation_HPW18(data.Dataset):
CLASSES = [
'background', 'hat', 'hair', 'sunglasses', 'upper-clothes',
'skirt', 'pants', 'dress', 'belt', 'left-shoe', 'right-shoe',
'face', 'left-leg', 'right-leg', 'left-arm', 'right-arm', 'bag',
'scarf'
]
URL = "/cvdata1/lliuqian/humanParsingDataset"
FILE = "hpw18.tar.gz"
MD5 = ''
BASE_DIR = ''
def __init__(self,
root,
train=True,
transform=None,
target_transform=None,
download=False):
self.root = root
_hpw18_root = os.path.join(self.root, self.BASE_DIR)
_mask_dir = os.path.join(_hpw18_root, 'SegmentationClassAug_256x384')
_image_dir = os.path.join(_hpw18_root, 'JPEGImages_256x384')
self.transform = transform
self.target_transform = target_transform
self.train = train
if download:
self._download()
# train/val/test splits are pre-cut
_splits_dir = _hpw18_root
_split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_train.txt')
if not self.train:
_split_f = os.path.join(_splits_dir, 'humanparsingImageMask_256x384_absPath_val.txt')
print("reading from ", _split_f)
self.images = []
self.masks = []
with open(os.path.join(_split_f), "r") as lines:
for line in lines:
s = line.split()
_image = s[0]  # absolute path to the image
_mask = s[1]   # absolute path to the mask
assert os.path.isfile(_image)
assert os.path.isfile(_mask)
self.images.append(_image)
self.masks.append(_mask)
assert (len(self.images) == len(self.masks))
def __getitem__(self, index):
_img = Image.open(self.images[index]).convert('RGB')
_timg = Image.open(self.masks[index])
_target = np.array(_timg, dtype=np.uint8)
_target = Image.fromarray(_target)
# synchronized transform
if self.train:
_img, _target = self._sync_transform( _img, _target)
# general resize, normalize and toTensor
if self.transform is not None:
_img = self.transform(_img)
if self.target_transform is not None:
_target = self.target_transform(_target)
return _img, _target
def __len__(self):
return len(self.images)
def _sync_transform(self, img, mask):
# random rotate -10~10
deg = random.uniform(-10,10)
img = img.rotate(deg)
mask = mask.rotate(deg, PIL.Image.NEAREST)
return img, mask
if __name__ == '__main__':
hpw18 = Segmentation_HPW18('/cvdata1/lliuqian/', train=True)
print(hpw18[0])
print (len(hpw18))
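Because this dataset returns the mask as a PIL image unless a target_transform is supplied, a small sketch of a mask-to-tensor transform (a hypothetical stand-in for whatever label transform the training script actually passes) could look like:
# sketch only -- a stand-in for the training script's label transform
import numpy as np
import torch

class MaskToTensor(object):
    """Convert a PIL label image into a LongTensor of class indices."""
    def __call__(self, mask):
        return torch.from_numpy(np.array(mask, dtype=np.int64))

hpw18_train = Segmentation_HPW18('/cvdata1/lliuqian/', train=True,
                                 target_transform=MaskToTensor())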
...@@ -11,6 +11,9 @@ import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import warnings
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
class ImageNetDataset(datasets.ImageFolder):
BASE_DIR = "ILSVRC2012"
def __init__(self, root=os.path.expanduser('~/.encoding/data'), transform=None,
......
"""Encoding Autograd Fuctions""" """Encoding Autograd Fuctions"""
from .encoding import * from .encoding import *
from .syncbn import * from .syncbn import *
from .dist_syncbn import dist_syncbatchnorm
from .customize import * from .customize import *
from .rectify import *
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## Email: zhanghang0704@gmail.com
## Copyright (c) 2020
##
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import torch
from torch.autograd.function import Function
from .. import lib
__all__ = ['dist_syncbatchnorm']
class dist_syncbatchnorm_(Function):
@staticmethod
def forward(ctx, x, gamma, beta, running_mean, running_var, eps, momentum, training, process_group):
x = x.contiguous()
ctx.training = training
ctx.momentum = momentum
ctx.eps = eps
ctx.process_group = process_group
if not ctx.training:
_ex, _var = running_mean.contiguous(), running_var.contiguous()
_exs = _var + _ex ** 2
if x.is_cuda:
y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
else:
y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
ctx.save_for_backward(x, _ex, _exs, gamma, beta)
return y
size = x.numel() // x.size(1)
if size == 1:
raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(size))
if x.is_cuda:
_ex, _exs = lib.gpu.expectation_forward(x)
else:
raise NotImplementedError
count = torch.Tensor([1]).to(x.device)
count_all_reduce = torch.distributed.all_reduce(count, group=process_group, async_op=True)
_ex_all_reduce = torch.distributed.all_reduce(_ex, group=process_group, async_op=True)
_exs_all_reduce = torch.distributed.all_reduce(_exs, group=process_group, async_op=True)
count_all_reduce.wait()
_ex_all_reduce.wait()
_exs_all_reduce.wait()
_ex = _ex / count
_exs = _exs / count
# Update running stats
_var = _exs - _ex ** 2
running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * _ex)
running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * _var)
# Mark in-place modified tensors
ctx.mark_dirty(running_mean, running_var)
# BN forward + activation
if x.is_cuda:
y = lib.gpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
else:
y = lib.cpu.batchnorm_forward(x, _ex, _exs, gamma, beta, ctx.eps)
ctx.save_for_backward(x, _ex, _exs, gamma, beta)
return y
@staticmethod
def backward(ctx, dz):
x, _ex, _exs, gamma, beta = ctx.saved_tensors
dz = dz.contiguous()
# BN backward
if dz.is_cuda:
dx, _dex, _dexs, dgamma, dbeta = \
lib.gpu.batchnorm_backward(dz, x, _ex, _exs, gamma, beta, ctx.eps)
else:
raise NotImplementedError
if ctx.training:
process_group = ctx.process_group
count = torch.Tensor([1]).to(x.device)
count_all_reduce = torch.distributed.all_reduce(count, group=process_group, async_op=True)
_dex_all_reduce = torch.distributed.all_reduce(_dex, group=process_group, async_op=True)
_dexs_all_reduce = torch.distributed.all_reduce(_dexs, group=process_group, async_op=True)
count_all_reduce.wait()
_dex_all_reduce.wait()
_dexs_all_reduce.wait()
_dex = _dex / count
_dexs = _dexs / count
if x.is_cuda:
dx_ = lib.gpu.expectation_backward(x, _dex, _dexs)
else:
raise NotImplementedError
dx = dx + dx_
return dx, dgamma, dbeta, None, None, None, None, None, None
dist_syncbatchnorm = dist_syncbatchnorm_.apply
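A minimal sketch of wrapping the autograd function above in a module, included only to illustrate its argument order; it assumes an initialized default process group when training and is not the repository's own DistSyncBatchNorm layer:
# sketch only -- illustrates the argument order of dist_syncbatchnorm;
# assumes torch.distributed has already been initialized
import torch
import torch.nn as nn
import torch.distributed as dist

class ToyDistSyncBN(nn.Module):
    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super(ToyDistSyncBN, self).__init__()
        self.eps, self.momentum = eps, momentum
        self.weight = nn.Parameter(torch.ones(num_features))
        self.bias = nn.Parameter(torch.zeros(num_features))
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))

    def forward(self, x):
        # statistics are all-reduced across the default process group
        group = dist.group.WORLD
        return dist_syncbatchnorm(x, self.weight, self.bias,
                                  self.running_mean, self.running_var,
                                  self.eps, self.momentum,
                                  self.training, group)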
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## Email: zhanghang0704@gmail.com
## Copyright (c) 2020
##
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Rectify function"""
import torch
from torch.autograd import Function
from .. import lib
__all__ = ['rectify']
class _rectify(Function):
@staticmethod
def forward(ctx, y, x, kernel_size, stride, padding, dilation, average):
ctx.save_for_backward(x)
# expand to the dilated kernel extent; note that k + 2*(d - 1) equals the
# general (k - 1)*d + 1 only when k == 3, so this assumes a 3x3 kernel
kernel_size = [k + 2 * (d - 1) for k, d in zip(kernel_size, dilation)]
ctx.kernel_size = kernel_size
ctx.stride = stride
ctx.padding = padding
ctx.dilation = dilation
ctx.average = average
if x.is_cuda:
lib.gpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
else:
lib.cpu.conv_rectify(y, x, kernel_size, stride, padding, dilation, average)
ctx.mark_dirty(y)
return y
@staticmethod
def backward(ctx, grad_y):
x, = ctx.saved_tensors
if x.is_cuda:
lib.gpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
ctx.padding, ctx.dilation, ctx.average)
else:
lib.cpu.conv_rectify(grad_y, x, ctx.kernel_size, ctx.stride,
ctx.padding, ctx.dilation, ctx.average)
ctx.mark_dirty(grad_y)
return grad_y, None, None, None, None, None, None
rectify = _rectify.apply
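A sketch of the intended use of rectify: rescale the outputs of a zero-padded convolution whose windows partially overlap the padding, in the spirit of the rectified convolution used by ResNeSt. The wrapper below is an assumption for illustration, not the repository's RFConv2d:
# sketch only -- a hand-rolled wrapper, not the library's RFConv2d
import torch.nn.functional as F

def rectified_conv2d(x, weight, stride=(1, 1), padding=(1, 1), dilation=(1, 1)):
    y = F.conv2d(x, weight, stride=stride, padding=padding, dilation=dilation)
    kernel_size = (weight.size(2), weight.size(3))
    # rectify rescales y in place (average=True) and returns it
    return rectify(y, x, kernel_size, stride, padding, dilation, True)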
...@@ -10,7 +10,7 @@
"""Synchronized Cross-GPU Batch Normalization functions"""
import torch
import torch.cuda.comm as comm
from torch.autograd import Variable, Function
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from .. import lib
......
...@@ -12,6 +12,7 @@ cpu = load('enclib_cpu', [
os.path.join(cpu_path, 'syncbn_cpu.cpp'),
os.path.join(cpu_path, 'roi_align_cpu.cpp'),
os.path.join(cpu_path, 'nms_cpu.cpp'),
os.path.join(cpu_path, 'rectify_cpu.cpp'),
], build_directory=cpu_path, verbose=False)
if torch.cuda.is_available():
...@@ -19,9 +20,9 @@ if torch.cuda.is_available():
os.path.join(gpu_path, 'operator.cpp'),
os.path.join(gpu_path, 'activation_kernel.cu'),
os.path.join(gpu_path, 'encoding_kernel.cu'),
os.path.join(gpu_path, 'encodingv2_kernel.cu'),
os.path.join(gpu_path, 'syncbn_kernel.cu'),
os.path.join(gpu_path, 'roi_align_kernel.cu'),
os.path.join(gpu_path, 'nms_kernel.cu'),
os.path.join(gpu_path, 'rectify_cuda.cu'),
], extra_cuda_cflags=["--expt-extended-lambda"],
build_directory=gpu_path, verbose=False)
...@@ -12,4 +12,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("sumsquare_forward", &Sum_Square_Forward_CPU, "SumSqu forward (CPU)");
m.def("sumsquare_backward", &Sum_Square_Backward_CPU, "SumSqu backward (CPU)");
m.def("non_max_suppression", &Non_Max_Suppression_CPU, "NMS (CPU)");
m.def("conv_rectify", &CONV_RECTIFY_CPU, "Convolution Rectifier (CPU)");
}
...@@ -72,3 +72,12 @@ std::vector<at::Tensor> Non_Max_Suppression_CPU(
const at::Tensor& input,
const at::Tensor& scores,
double thresh);
void CONV_RECTIFY_CPU(
at::Tensor& output,
const at::Tensor& input,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool avg_mode);
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/NativeFunctions.h>
#include <tuple>
#include <torch/extension.h>
#include <ATen/div_rtn.h>
#include <ATen/TensorUtils.h>
#include <ATen/AccumulateType.h>
template <typename dest_t, typename src_t>
static inline dest_t safe_downcast(src_t v)
{
TORCH_CHECK(std::numeric_limits<dest_t>::min() <= v && v <= std::numeric_limits<dest_t>::max(),
"integer out of range");
return static_cast<dest_t>(v);
}
template<typename T>
static inline T pooling_output_shape_pad_lr(
T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation,
bool ceil_mode) {
T outputSize = div_rtn<T>(
inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 +
(ceil_mode ? stride - 1 : 0), stride) + 1;
if (pad_l) {
// ensure that the last pooling starts inside the image
// needed to avoid problems in ceil mode
if ((outputSize - 1) * stride >= inputSize + pad_l)
--outputSize;
}
return outputSize;
}
template<typename T>
static inline T pooling_output_shape(
T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) {
return pooling_output_shape_pad_lr(
inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
}
static inline void pool2d_shape_check(
const at::Tensor& input,
int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW,
int64_t nInputPlane,
int64_t inputHeight, int64_t inputWidth,
int64_t outputHeight, int64_t outputWidth)
{
const int64_t ndim = input.ndimension();
const int64_t nOutputPlane = nInputPlane;
TORCH_CHECK(kW > 0 && kH > 0,
"kernel size should be greater than zero, but got ",
"kH: ", kH, " kW: ", kW);
TORCH_CHECK(dW > 0 && dH > 0,
"stride should be greater than zero, but got "
"dH: ", dH, " dW: ", dW);
TORCH_CHECK(dilationH > 0 && dilationW > 0,
"dilation should be greater than zero, but got ",
"dilationH: ", dilationH, " dilationW: ", dilationW);
TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4),
"non-empty 3D or 4D input tensor expected but got ndim: ", ndim);
//TORCH_CHECK(kW/2 >= padW && kH/2 >= padH,
// "pad should be smaller than half of kernel size, but got ",
// "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH);
TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
"Given input size: (",
nInputPlane, "x", inputHeight, "x", inputWidth, "). ",
"Calculated output size: (",
nOutputPlane, "x", outputHeight, "x", outputWidth, "). ",
"Output size is too small");
}
template <typename scalar_t>
static void conv_rectify_cpu_frame(
scalar_t *output_data,
int64_t nbatch,
int64_t nInputPlane,
int64_t inputWidth,
int64_t inputHeight,
int64_t outputWidth,
int64_t outputHeight,
int kW,
int kH,
int dW,
int dH,
int padW,
int padH,
const int dilation_h,
const int dilation_w,
bool average_mode) {
//at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) {
for (int64_t k = 0; k < nInputPlane; k++) {
int64_t p;
for(p = 0; p < nbatch; p++)
{
int64_t xx, yy;
/* For all output pixels... */
scalar_t *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
//int64_t i;
for(yy = 0; yy < outputHeight; yy++)
{
for(xx = 0; xx < outputWidth; xx++)
{
/* Compute the mean of the input image... */
int64_t hstart = yy * dH - padH;
int64_t wstart = xx * dW - padW;
int64_t hend = std::min(hstart + kH, inputHeight + padH);
int64_t wend = std::min(wstart + kW, inputWidth + padW);
//int pool_size = (hend - hstart) * (wend - wstart);
int pool_size = ((kH - 1) / dilation_h + 1) * ((kW - 1) / dilation_w + 1);
hstart = std::max(hstart, (int64_t) 0);
wstart = std::max(wstart, (int64_t) 0);
hend = std::min(hend, inputHeight);
wend = std::min(wend, inputWidth);
int hcount = int(((hend - hstart) - 1) / dilation_h + 1);
int wcount = int(((wend - wstart) - 1) / dilation_w + 1);
scalar_t mul_factor;
if (average_mode) {
mul_factor = scalar_t(1.0) / (hcount * wcount);
}
else {
mul_factor = scalar_t(1.0) * pool_size / (hcount * wcount);
}
*ptr_output++ *= mul_factor;
}
}
}
}
//});
}
void conv_rectify_cpu_template(
at::Tensor &output,
const at::Tensor &input_,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool average_mode)
{
// #20866, #22032: Guarantee this for the official C++ API?
TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
"conv_rectify: kernel_size must either be a single int, or a tuple of two ints");
const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
"conv_rectify: stride must either be omitted, a single int, or a tuple of two ints");
const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
const int dW = stride.empty() ? kW :
stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
"conv_rectify: padding must either be a single int, or a tuple of two ints");
const int padH = safe_downcast<int, int64_t>(padding[0]);
const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
"rectify: dilation must either be a single int, or a tuple of two ints");
const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
"non-empty 2D or 3D (batch mode) tensor expected for input");
/* sizes */
const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
const int64_t nInputPlane = input_.size(-3);
const int64_t inputHeight = input_.size(-2);
const int64_t inputWidth = input_.size(-1);
//const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, false);
//const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, false);
const int64_t outputHeight = output.size(-2);
const int64_t outputWidth = output.size(-1);
pool2d_shape_check(
input_,
kH, kW, dH, dW, padH, padW, dilationH, dilationW,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth);
TORCH_CHECK(output.is_contiguous(), "conv_rectify: output must be contiguous");
at::Tensor input = input_.contiguous();
AT_DISPATCH_FLOATING_TYPES(input.type(), "conv_rectify_cpu_frame", ([&] {
scalar_t *output_data = output.data_ptr<scalar_t>();
conv_rectify_cpu_frame<scalar_t>(
output_data,
nbatch,
nInputPlane,
inputWidth, inputHeight,
outputWidth, outputHeight,
kW, kH,
dW, dH,
padW, padH,
dilationH,
dilationW,
average_mode);
}
));
}
void CONV_RECTIFY_CPU(
at::Tensor& output,
const at::Tensor& input,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool average) {
//at::Tensor output = at::empty({0}, input.options());
conv_rectify_cpu_template(
output,
input,
kernel_size,
stride,
padding,
dilation,
average);
}
#include <vector>
#include <torch/extension.h>
#include <ATen/ATen.h>
// #include <ATen/cuda/CUDAContext.h>
#include <vector>
#include <cuda_runtime_api.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include "common.h"
namespace {
......
#include <vector>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/Functions.h>
#include <ATen/cuda/CUDAContext.h>
#include "common.h"
#include "device_tensor.h"
namespace {
template<typename DType, typename Acctype>
struct KD2Op {
__device__ KD2Op(DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d)
{
DType r = (X[b][i][d] - C[k][d]) / STD[k][d];
return ScalarConvert<DType, Acctype>::to(r * r);
}
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void Encoding_Dist_Forward_kernel (
DeviceTensor<DType, 3> KD,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
KD2Op<DType, Acctype> g(X, C, STD);
KD[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
template<typename DType, typename Acctype>
struct EncGradXOp {
__device__ EncGradXOp(
DeviceTensor<DType, 3> gkd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
// DeviceTensor<DType, 1> s, S(s)
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) /
(STD[k][d] * STD[k][d]));
}
DeviceTensor<DType, 3> GKD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
// DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradX_kernel (
DeviceTensor<DType, 3> GKD,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
// DeviceTensor<DType, 1> S
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
d = blockIdx.x;
K = C.getSize(0);
/* main operation */
EncGradXOp<DType, Acctype> g(GKD, X, C, STD);
GX[b][i][d] = reduceK<Acctype>(g, b, i, d, K);
}
template<typename DType, typename Acctype>
struct EncGradSTDOp {
__device__ EncGradSTDOp(
DeviceTensor<DType, 3> gkd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
// DeviceTensor<DType, 1> s, S(s)
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
-2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) *
(X[b][i][d] - C[k][d]) / (STD[k][d] * STD[k][d] * STD[k][d]));
}
DeviceTensor<DType, 3> GKD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
// DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradCSTD_kernel (
DeviceTensor<DType, 3> GKD,
DeviceTensor<DType, 2> GC,
DeviceTensor<DType, 2> GSTD,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
EncGradXOp<DType, Acctype> g1(GKD, X, C, STD);
EncGradSTDOp<DType, Acctype> g2(GKD, X, C, STD);
GC[k][d] = -reduceBN<Acctype>(g1, k, d, B, N);
GSTD[k][d] += reduceBN<Acctype>(g2, k, d, B, N);
}
template<typename DType, typename Acctype>
struct EncGradSTDXOp {
__device__ EncGradSTDXOp(
DeviceTensor<DType, 2> gstd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GSTD(gstd), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
GSTD[k][d] * (X[b][i][d] - C[k][d]) / STD[k][d]);
}
DeviceTensor<DType, 2> GSTD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradSTDX_kernel (
DeviceTensor<DType, 2> GSTD,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD,
int N) {
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
d = blockIdx.x;
K = C.getSize(0);
/* main operation */
EncGradSTDXOp<DType, Acctype> g(GSTD, X, C, STD);
GX[b][i][d] += reduceK<Acctype>(g, b, i, d, K) / N;
}
template<typename DType, typename Acctype>
struct AggOpV2 {
__device__ AggOpV2(DeviceTensor<DType, 3> a,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : A(a), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(A[b][i][k] * (X[b][i][d] - C[k][d]) /
STD[k][d]);
}
DeviceTensor<DType, 3> A;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void AggregateV2_Forward_kernel (
DeviceTensor<DType, 3> E,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
AggOpV2<DType, Acctype> g(A, X, C, STD);
E[b][k][d] = reduceN<Acctype>(g, b, k, d, N);
}
template<typename DType, typename Acctype>
struct AggV2BackOp {
__device__ AggV2BackOp(DeviceTensor<DType, 3> g,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : G(g), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(G[b][k][d] * (X[b][i][d] - C[k][d]) /
STD[k][d]);
}
DeviceTensor<DType, 3> G;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void AggregateV2_Backward_kernel (
DeviceTensor<DType, 3> GA,
DeviceTensor<DType, 3> GE,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
AggV2BackOp<DType, Acctype> g(GE, X, C, STD);
GA[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
} // namespace
at::Tensor Encoding_Dist_Inference_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
// calculate the kernel distance
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Inference_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (KD, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return KD_;
}
std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
auto GSTD_ = at::zeros_like(STD_);
/* kernel function */
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
int N = X_.size(0) * X_.size(1);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
Encoding_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GKD, GX, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradCSTD_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GKD, GC, GSTD, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
return {GX_, GC_, GSTD_};
}
std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
double eps) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = torch::zeros({X_.size(0), X_.size(1), C_.size(0)}, X_.options());
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
auto SVar_ = (X_.pow(2).sum(0).sum(0).view({1, X_.size(2)}) -
2 * C_ * X_.sum(0).sum(0).view({1, X_.size(2)})).expand_as(C_) +
C_.pow(2) * N;
auto STD_ = at::sqrt(SVar_ / N + eps);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
// calculate the kernel distance
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (KD, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {KD_, STD_, SVar_ / (N - 1)};
}
std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor GSTD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
auto GSTD2_ = GSTD_.clone();
/* kernel function */
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
int N = X_.size(0) * X_.size(1);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD2_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
Encoding_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GKD, GX, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradCSTD_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GKD, GC, GSTD, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradSTDX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GSTD, GX, X, C, STD, N);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
// d_sigma/d_c
GC_ = GC_ - GSTD2_ * (X_.mean(0).mean(0) - C_) / STD_;
return {GX_, GC_};
}
at::Tensor AggregateV2_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
/* Device tensors */
auto E_ = torch::zeros({A_.size(0), C_.size(0), C_.size(1)}, A_.options());
// auto IS_ = 1.0f / (S_ + eps).sqrt();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
dim3 threads(getNumThreads(X_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Forward_CUDA", ([&] {
DeviceTensor<scalar_t, 3> E = devicetensor<scalar_t, 3>(E_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
AggregateV2_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>>(E, A, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return E_;
}
std::vector<at::Tensor> AggregateV2_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor E_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto gradA_ = at::zeros_like(A_);
auto gradX_ = at::bmm(A_ , (GE_ / STD_.unsqueeze(0)));
auto gradC_ = -(A_.sum(1).unsqueeze(2) * GE_ / STD_.unsqueeze(0)).sum(0);
auto gradSTD_ = -(GE_ * E_).sum(0) / STD_;
// auto gradS_ = -0.5 * (GE_ * E_).sum(2).sum(0) / S_;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GA = devicetensor<scalar_t, 3>(gradA_);
DeviceTensor<scalar_t, 3> GE = devicetensor<scalar_t, 3>(GE_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
AggregateV2_Backward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (GA, GE, A, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {gradA_, gradX_, gradC_, gradSTD_};
}
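For reference, the math implemented by the Encoding_Dist and AggregateV2 kernels above can be expressed in a few lines of plain PyTorch; the sketch below is only meant for checking shapes and semantics, not as a replacement for the CUDA path:
# reference sketch only -- mirrors the kernels above with plain tensor ops
import torch

def encoding_dist_forward_ref(X, C, eps):
    # X: (B, N, D) descriptors, C: (K, D) codewords
    B, N, _ = X.shape
    diff = X.unsqueeze(2) - C.unsqueeze(0).unsqueeze(0)   # (B, N, K, D)
    svar = diff.pow(2).sum(dim=(0, 1))                    # (K, D)
    std = torch.sqrt(svar / (B * N) + eps)                # per-codeword scale
    kd = (diff / std).pow(2).sum(dim=-1)                  # (B, N, K)
    return kd, std, svar / (B * N - 1)

def aggregate_v2_forward_ref(A, X, C, STD):
    # A: (B, N, K) assignment weights; returns E: (B, K, D)
    scaled = (X.unsqueeze(2) - C.unsqueeze(0).unsqueeze(0)) / STD
    return torch.einsum('bnk,bnkd->bkd', A, scaled)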
...@@ -16,14 +16,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("expectation_backward", &Expectation_Backward_CUDA, "Expectation backward (CUDA)");
m.def("expectation_inp_backward", &Expectation_Inp_Backward_CUDA,
"Inplace Expectation backward (CUDA)");
m.def("encoding_dist_forward", &Encoding_Dist_Forward_CUDA, "EncDist forward (CUDA)");
m.def("encoding_dist_backward", &Encoding_Dist_Backward_CUDA, "Assign backward (CUDA)");
m.def("encoding_dist_inference_forward", &Encoding_Dist_Inference_Forward_CUDA,
"EncDist Inference forward (CUDA)");
m.def("encoding_dist_inference_backward", &Encoding_Dist_Inference_Backward_CUDA,
"Assign Inference backward (CUDA)");
m.def("aggregatev2_forward", &AggregateV2_Forward_CUDA, "AggregateV2 forward (CUDA)");
m.def("aggregatev2_backward", &AggregateV2_Backward_CUDA, "AggregateV2 backward (CUDA)");
m.def("leaky_relu_forward", &LeakyRelu_Forward_CUDA, "Leaky ReLU forward (CUDA)");
m.def("leaky_relu_backward", &LeakyRelu_Backward_CUDA, "Leaky ReLU backward (CUDA)");
m.def("conv_rectify", &CONV_RECTIFY_CUDA, "Convolution Rectifier (CUDA)");
}
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <vector>
at::Tensor ROIAlign_Forward_CUDA(
...@@ -102,45 +103,15 @@ at::Tensor Expectation_Inp_Backward_CUDA(
const at::Tensor beta_,
float eps);
at::Tensor Encoding_Dist_Inference_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
const at::Tensor X,
const at::Tensor C,
double eps);
std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor GSTD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
at::Tensor AggregateV2_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> AggregateV2_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor E_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
void LeakyRelu_Forward_CUDA(at::Tensor z, float slope);
void LeakyRelu_Backward_CUDA(at::Tensor z, at::Tensor dz, float slope);
void CONV_RECTIFY_CUDA(
at::Tensor& output,
const at::Tensor& input,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool avg_mode);
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/div_rtn.h>
#include <ATen/Dispatch.h>
#include <ATen/TensorUtils.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/KernelUtils.h>
template <typename dest_t, typename src_t>
static inline dest_t safe_downcast(src_t v)
{
TORCH_CHECK(std::numeric_limits<dest_t>::min() <= v && v <= std::numeric_limits<dest_t>::max(),
"integer out of range");
return static_cast<dest_t>(v);
}
template<typename T>
static inline T pooling_output_shape_pad_lr(
T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation,
bool ceil_mode) {
T outputSize = div_rtn<T>(
inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 +
(ceil_mode ? stride - 1 : 0), stride) + 1;
if (pad_l) {
// ensure that the last pooling starts inside the image
// needed to avoid problems in ceil mode
if ((outputSize - 1) * stride >= inputSize + pad_l)
--outputSize;
}
return outputSize;
}
template<typename T>
static inline T pooling_output_shape(
T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) {
return pooling_output_shape_pad_lr(
inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
}
static inline void pool2d_shape_check(
const at::Tensor& input,
int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW,
int64_t nInputPlane,
int64_t inputHeight, int64_t inputWidth,
int64_t outputHeight, int64_t outputWidth)
{
const int64_t ndim = input.ndimension();
const int64_t nOutputPlane = nInputPlane;
TORCH_CHECK(kW > 0 && kH > 0,
"kernel size should be greater than zero, but got ",
"kH: ", kH, " kW: ", kW);
TORCH_CHECK(dW > 0 && dH > 0,
"stride should be greater than zero, but got "
"dH: ", dH, " dW: ", dW);
TORCH_CHECK(dilationH > 0 && dilationW > 0,
"dilation should be greater than zero, but got ",
"dilationH: ", dilationH, " dilationW: ", dilationW);
TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4),
"non-empty 3D or 4D input tensor expected but got ndim: ", ndim);
//TORCH_CHECK(kW/2 >= padW && kH/2 >= padH,
// "pad should be smaller than half of kernel size, but got ",
// "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH);
TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
"Given input size: (",
nInputPlane, "x", inputHeight, "x", inputWidth, "). ",
"Calculated output size: (",
nOutputPlane, "x", outputHeight, "x", outputWidth, "). ",
"Output size is too small");
}
template <typename scalar_t, typename accscalar_t>
__global__ void conv_rectify_cuda_frame(
const int nthreads,
//const scalar_t* const bottom_data,
const int num, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w,
scalar_t* const top_data,
bool average_mode) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int pw = index % pooled_width;
const int ph = (index / pooled_width) % pooled_height;
//const int c = (index / pooled_width / pooled_height) % channels;
//const int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
const int pool_size = ((kernel_h - 1) / dilation_h + 1) * ((kernel_w - 1) / dilation_w + 1);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
accscalar_t mul_factor;
int hcount = int(((hend - hstart) - 1) / dilation_h + 1);
int wcount = int(((wend - wstart) - 1) / dilation_w + 1);
if (average_mode) {
mul_factor = accscalar_t(1.0) / (hcount * wcount);
}
else {
mul_factor = accscalar_t(1.0) * pool_size / (hcount * wcount);
}
top_data[index] = ScalarConvert<accscalar_t, scalar_t>::to(top_data[index] * mul_factor);
}
}
void conv_rectify_cuda_template(
at::Tensor& output,
const at::Tensor& input_,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool average)
{
//at::TensorArg output_arg{ output, "output", 1 };
//at::TensorArg input_arg{ input_, "input_", 2 };
//checkAllSameGPU("rectify_out_cuda", {output_arg, input_arg});
// #20866, #22032: Guarantee this for the official C++ API?
TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
"rectify: kernel_size must either be a single int, or a tuple of two ints");
const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
"rectify: stride must either be omitted, a single int, or a tuple of two ints");
const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
const int dW = stride.empty() ? kW :
stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
"rectify: padding must either be a single int, or a tuple of two ints");
const int padH = safe_downcast<int, int64_t>(padding[0]);
const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
"rectify: dilation must either be a single int, or a tuple of two ints");
const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1;
const int64_t nInputPlane = input_.size(-3);
const int64_t inputHeight = input_.size(-2);
const int64_t inputWidth = input_.size(-1);
//const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, false);
//const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, false);
const int64_t outputHeight = output.size(-2);
const int64_t outputWidth = output.size(-1);
pool2d_shape_check(
input_,
kH, kW, dH, dW, padH, padW, dilationH, dilationW,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth);
at::Tensor input = input_.contiguous();
//output.resize_({nbatch, nInputPlane, outputHeight, outputWidth});
const int32_t count = safe_downcast<int32_t, int64_t>(output.numel());
const uint32_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
const uint32_t num_blocks = at::cuda::ATenCeilDiv<uint32_t>(count, num_threads);
AT_DISPATCH_FLOATING_TYPES(input.type(), "conv_rectify_cuda_frame", ([&] {
//using accscalar_t = acc_type<scalar_t, true>;
scalar_t *output_data = output.data_ptr<scalar_t>();
conv_rectify_cuda_frame<scalar_t, scalar_t>
<<<num_blocks, num_threads, 0, at::cuda::getCurrentCUDAStream()>>>(
count,
nbatch,
nInputPlane,
inputHeight, inputWidth,
outputHeight, outputWidth,
kH, kW,
dH, dW,
padH, padW,
dilationH, dilationW,
output_data,
average);
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void CONV_RECTIFY_CUDA(
at::Tensor& output,
const at::Tensor& input,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation,
bool average) {
//at::Tensor output = at::empty({0}, input.options());
conv_rectify_cuda_template(
output,
input,
kernel_size,
stride,
padding,
dilation,
average);
}
...@@ -8,10 +8,10 @@ setup(
'operator.cpp',
'activation_kernel.cu',
'encoding_kernel.cu',
'encodingv2_kernel.cu',
'syncbn_kernel.cu',
'roi_align_kernel.cu',
'nms_kernel.cu',
'rectify.cu',
]),
],
cmdclass={
......
#include <vector>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <vector>
#include "common.h"
#include "device_tensor.h"
......
from .model_zoo import get_model
from .model_store import get_model_file
from .resnet import *
from .cifarresnet import *
from .base import *
from .fcn import *
from .psp import *
from .encnet import *
from .deeplab import *
from .sseg import get_segmentation_model, MultiEvalModule
def get_segmentation_model(name, **kwargs):
from .fcn import get_fcn
models = {
'fcn': get_fcn,
'psp': get_psp,
'atten': get_atten,
'encnet': get_encnet,
'encnetv2': get_encnetv2,
'deeplab': get_deeplab,
}
return models[name.lower()](**kwargs)