"git@developer.sourcefind.cn:norm/vllm.git" did not exist on "1a7eb7da6157541ed7867c9aff94231695f2cee9"
Unverified Commit 67e153dd authored by Hang Zhang's avatar Hang Zhang Committed by GitHub
Browse files

update and fix bugs (#51)

parent 71447e1b
......@@ -15,8 +15,8 @@ import subprocess
from torch.utils.ffi import create_extension
lib_path = os.path.join(os.path.dirname(torch.__file__), 'lib')
cwd = os.path.dirname(os.path.realpath(__file__))
encoding_lib_path = os.path.join(cwd, "encoding", "lib")
cwd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'encoding/')
encoding_lib_path = os.path.join(cwd, "lib")
# clean the build files
clean_cmd = ['bash', 'clean.sh']
......@@ -25,13 +25,13 @@ subprocess.check_call(clean_cmd)
# build CUDA library
os.environ['TORCH_BUILD_DIR'] = lib_path
if platform.system() == 'Darwin':
os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.1.dylib')
ENCODING_LIB = os.path.join(cwd, 'encoding/lib/libENCODING.dylib')
os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.dylib')
ENCODING_LIB = os.path.join(cwd, 'lib/libENCODING.dylib')
else:
os.environ['CFLAGS'] = '-std=c99'
os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.so.1')
ENCODING_LIB = os.path.join(cwd, 'encoding/lib/libENCODING.so')
os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.so')
ENCODING_LIB = os.path.join(cwd, 'lib/libENCODING.so')
build_all_cmd = ['bash', 'encoding/make.sh']
subprocess.check_call(build_all_cmd, env=dict(os.environ))
......@@ -45,9 +45,9 @@ defines = [('WITH_CUDA', None)]
with_cuda = True
include_path = [os.path.join(lib_path, 'include'),
os.path.join(cwd,'encoding/kernel'),
os.path.join(cwd,'encoding/kernel/include'),
os.path.join(cwd,'encoding/src/')]
os.path.join(cwd,'kernel'),
os.path.join(cwd,'kernel/include'),
os.path.join(cwd,'src/')]
def make_relative_rpath(path):
if platform.system() == 'Darwin':
......@@ -63,6 +63,7 @@ ffi = create_extension(
define_macros=defines,
relative_to=__file__,
with_cuda=with_cuda,
extra_compile_args=["-std=c99"],
include_dirs = include_path,
extra_link_args = [
make_relative_rpath(lib_path),
......
#!/usr/bin/env bash
rm -rf build/ dist/ encoding.egg-info/ encoding/lib/ encoding/_ext/ __pycache__ encoding/__pycache__
rm -rf build/ dist/ torch_encoding.egg-info/ encoding/lib/ encoding/_ext/ __pycache__ encoding/__pycache__
.. role:: hidden
:class: hidden-section
Dilated Networks
encoding.dilated
================
We provide correct dilated pre-trained ResNet and DenseNet (stride of 8) for semantic segmentation.
......
......@@ -4,10 +4,20 @@
encoding.functions
==================
.. automodule:: encoding.functions
.. automodule:: encoding.Functions
.. currentmodule:: encoding.functions
:hidden:`batchnorm`
~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batchnorm
:hidden:`batchnormeval`
~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batchnormeval
:hidden:`dilatedavgpool2d`
~~~~~~~~~~~~~~~~~~~~~~~~~~
......
......@@ -10,7 +10,7 @@ Created by `Hang Zhang <http://hangzh.com/>`_
An optimized PyTorch package with CUDA backend.
.. note::
Please checkout the PyTorch compatible Synchronized Cross-GPU :class:`encoding.nn.BatchNorm2d` and the `mnist example <https://github.com/zhanghang1989/PyTorch-SyncBatchNorm>`_.
PyTorch compatible Synchronized Cross-GPU :class:`encoding.nn.SyncBatchNorm2d` and the `MNIST example <https://github.com/zhanghang1989/PyTorch-SyncBatchNorm>`_.
.. toctree::
:glob:
......@@ -30,8 +30,7 @@ An optimized PyTorch package with CUDA backend.
:maxdepth: 1
:caption: Package Reference
encoding
syncbn
nn
parallel
dilated
functions
......
......@@ -5,9 +5,9 @@ Install and Citations
Install from Source
-------------------
* Install PyTorch from Source (recommended). Please follow the `PyTorch instructions <https://github.com/pytorch/pytorch#from-source>`_.
* Install this package
* Install PyTorch by following the `PyTorch instructions <http://pytorch.org/>`_.
* Install from source
- Clone the repo::
......@@ -15,12 +15,10 @@ Install from Source
- On Linux::
pip install -r requirements.txt
python setup.py install
- On Mac OSX::
pip install -r requirements.txt
MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
Citations
......
.. role:: hidden
:class: hidden-section
Data Parallel
=============
encoding.parallel
=================
- Current PyTorch DataParallel Table is not supporting mutl-gpu loss calculation, which makes the gpu memory usage very in-balance. We address this issue here by doing Model & CriterionDataParallel.
- Current PyTorch DataParallel Table does not support multi-GPU loss calculation, which makes the GPU memory usage very imbalanced. We address this issue here by doing DataParallel for Model & Criterion.
.. note::
This code is provided together with the paper
......@@ -15,16 +15,16 @@ Data Parallel
.. automodule:: encoding.parallel
.. currentmodule:: encoding.parallel
:hidden:`ModelDataParallel`
:hidden:`DataParallelModel`
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: ModelDataParallel
.. autoclass:: DataParallelModel
:members:
:hidden:`CriterionDataParallel`
:hidden:`DataParallelCriterion`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: CriterionDataParallel
.. autoclass:: DataParallelCriterion
:members:
......
.. role:: hidden
:class: hidden-section
My PyTorch Utils
================
encoding.utils
==============
Useful util functions.
.. automodule:: encoding.utils
.. currentmodule:: encoding.utils
:hidden:`LR_Scheduler`
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: LR_Scheduler
:members:
:hidden:`get_optimizer`
~~~~~~~~~~~~~~~~~~~~~~~
......@@ -24,3 +18,13 @@ Useful util functions.
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: save_checkpoint
:hidden:`batch_pix_accuracy`
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batch_pix_accuracy
:hidden:`batch_intersection_union`
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: batch_intersection_union
"""Dilated ResNet"""
import math
import torch.utils.model_zoo as model_zoo
from .. import nn
#from .. import nn
import torch.nn as nn
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'BasicBlock', 'Bottleneck']
......@@ -25,15 +26,16 @@ class BasicBlock(nn.Module):
"""ResNet BasicBlock
"""
expansion = 1
def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1):
def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1,
norm_layer=None):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=False)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
padding=first_dilation, dilation=first_dilation, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
......@@ -62,18 +64,18 @@ class Bottleneck(nn.Module):
# pylint: disable=unused-argument
expansion = 4
def __init__(self, inplanes, planes, stride=1, dilation=1,
downsample=None, first_dilation=1):
downsample=None, first_dilation=1, norm_layer=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.bn1 = norm_layer(planes)
self.conv2 = nn.Conv2d(
planes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.bn2 = norm_layer(planes)
self.conv3 = nn.Conv2d(
planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.bn3 = norm_layer(planes * 4)
self.relu = nn.ReLU(inplace=False)
self.downsample = downsample
self.dilation = dilation
self.stride = stride
......@@ -118,18 +120,18 @@ class ResNet(nn.Module):
- Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
"""
# pylint: disable=unused-variable
def __init__(self, block, layers, num_classes=1000):
def __init__(self, block, layers, num_classes=1000, norm_layer=None):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.bn1 = norm_layer(64)
self.relu = nn.ReLU(inplace=False)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4)
self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
self.avgpool = nn.AvgPool2d(7)
self.fc = nn.Linear(512 * block.expansion, num_classes)
......@@ -137,32 +139,33 @@ class ResNet(nn.Module):
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
elif isinstance(m, norm_layer):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
norm_layer(planes * block.expansion),
)
layers = []
if dilation == 1 or dilation == 2:
layers.append(block(self.inplanes, planes, stride, dilation=1,
downsample=downsample, first_dilation=dilation))
downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
elif dilation == 4:
layers.append(block(self.inplanes, planes, stride, dilation=2,
downsample=downsample, first_dilation=dilation))
downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
else:
raise RuntimeError("=> unknown dilation size: {}".format(dilation))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation))
layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
......
......@@ -8,55 +8,54 @@
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Synchronized Batch Normalization functions"""
"""Synchronized Cross-GPU Batch Normalization functions"""
import torch
from torch.autograd import Function, Variable
from torch.autograd import Variable, Function
from .._ext import encoding_lib
__all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
def sum_square(input):
r"""Calculate sum of elements and sum of squares for Batch Normalization"""
return _sum_square.apply(input)
class _sum_square(Function):
@staticmethod
def forward(ctx, input):
ctx.save_for_backward(input)
B, C, _, _ = input.size()
C = input.size(1)
with torch.cuda.device_of(input):
xsum = input.new().resize_(C).zero_()
xsquare = input.new().resize_(C).zero_()
if isinstance(input, torch.cuda.FloatTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Float_sum_square_Forward(
input.view(B, C, -1), xsum, xsquare)
input, xsum, xsquare)
elif isinstance(input, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input):
encoding_lib.Encoding_Double_sum_square_Forward(
input.view(B, C, -1), xsum, xsquare)
input, xsum, xsquare)
else:
raise RuntimeError('Unimplemented data type!')
raise RuntimeError('Unimplemented data type!', type(input))
return xsum, xsquare
@staticmethod
def backward(ctx, gradSum, gradSquare):
input, = ctx.saved_variables
B, C, H, W = input.data.size()
with torch.cuda.device_of(input.data):
gradInput = Variable(input.data.new().resize_(B, C, H*W).zero_())
gradInput = Variable(input.data.new().resize_as_(input.data).zero_())
if isinstance(input.data, torch.cuda.FloatTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Float_sum_square_Backward(
gradInput, input.data.view(B, C, -1), gradSum, gradSquare)
gradInput.data, input.data, gradSum.data, gradSquare.data)
elif isinstance(input.data, torch.cuda.DoubleTensor):
with torch.cuda.device_of(input.data):
encoding_lib.Encoding_Double_sum_square_Backward(
gradInput, input.data.view(B, C, -1), gradSum, gradSquare)
gradInput.data, input.data, gradSum.data, gradSquare.data)
else:
raise RuntimeError('Unimplemented data type!')
return gradInput.view(B, C, H, W)
def sum_square(input):
r"""Calculate sum of elements and sum of squares for Batch Normalization"""
return _sum_square.apply(input)
return gradInput
class _batchnorm(Function):
......@@ -134,3 +133,4 @@ def batchnormeval(input, gamma, beta, mean, std):
Please see encoding.batchnormtrain_
"""
return _batchnorm(False)(input, gamma, beta, mean, std)
......@@ -17,6 +17,8 @@ THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
return THCDeviceTensor<real, Dim>();
}
int inDim = THCTensor_(nDimension)(state, t);
return toDeviceTensor<real, Dim>(state, t);
/*
if (inDim == Dim) {
return toDeviceTensor<real, Dim>(state, t);
}
......@@ -33,6 +35,7 @@ THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
}
}
return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
*/
}
struct Encoding_(Float2)
......
......@@ -8,7 +8,7 @@
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#include <THC/THC.h>
#include <THC.h>
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
......
#!/usr/bin/env bash
mkdir -p encoding/lib && cd encoding/lib
# compile and install
cmake ..
......
......@@ -10,12 +10,14 @@
"""Encoding Custermized NN Module"""
import torch
from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d
from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d, \
NLLLoss, BCELoss, CrossEntropyLoss
from torch.nn import functional as F
from .syncbn import BatchNorm2d
__all__ = ['GramMatrix', 'View', 'Sum', 'Mean', 'Normalize', 'PyramidPooling']
__all__ = ['GramMatrix', 'SegmentationLosses', 'View', 'Sum', 'Mean',
'Normalize', 'PyramidPooling']
class GramMatrix(Module):
......@@ -31,6 +33,46 @@ class GramMatrix(Module):
gram = features.bmm(features_t) / (ch * h * w)
return gram
def softmax_crossentropy(input, target, weight, size_average, ignore_index, reduce=True):
    """Softmax cross-entropy computed as log-softmax followed by NLL loss."""
    log_probs = F.log_softmax(input, 1)
    return F.nll_loss(log_probs, target, weight, size_average, ignore_index, reduce)
class SegmentationLosses(CrossEntropyLoss):
    """2D cross-entropy loss with an optional auxiliary loss term.

    When ``aux`` is True, ``forward`` expects ``(pred1, pred2, target)`` and
    returns ``loss(pred1, target) + aux_weight * loss(pred2, target)``;
    otherwise it behaves exactly like :class:`CrossEntropyLoss`.
    """
    def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
        super(SegmentationLosses, self).__init__(weight, size_average, ignore_index)
        self.aux = aux
        self.aux_weight = aux_weight

    def forward(self, *inputs):
        # Without an auxiliary head this is plain cross-entropy.
        if not self.aux:
            return super(SegmentationLosses, self).forward(*inputs)
        main_pred, aux_pred, target = inputs
        main_loss = super(SegmentationLosses, self).forward(main_pred, target)
        aux_loss = super(SegmentationLosses, self).forward(aux_pred, target)
        return main_loss + self.aux_weight * aux_loss
"""
class SegmentationLosses(Module):
def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
super(SegmentationLosses, self).__init__()
self.aux = aux
self.aux_weight = aux_weight
# Somehow the size averge is not handled correctly on multi-gpu, so we average by ourself.
self.nll_loss = NLLLoss(weight, ignore_index=ignore_index, reduce=True)
def _forward_each(self, inputs, targets):
return self.nll_loss(F.log_softmax(inputs, dim=1), targets)
def forward(self, *inputs):
if not self.aux:
return self._forward_each(*inputs)
pred1, pred2, target = tuple(inputs)
loss1 = self._forward_each(pred1, target)
loss2 = self._forward_each(pred2, target)
return loss1 + self.aux_weight * loss2
"""
class View(Module):
"""Reshape the input into different size, an inplace operator, support
......
......@@ -9,48 +9,63 @@
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Synchronized Cross-GPU Batch Normalization Module"""
import functools
import collections
import threading
import torch
from torch.nn import Module, Sequential, Conv1d, Conv2d, ConvTranspose2d, \
ReLU, Sigmoid, MaxPool2d, AvgPool2d, AdaptiveAvgPool2d, Dropout2d, Linear
ReLU, Sigmoid, MaxPool2d, AvgPool2d, AdaptiveAvgPool2d, Dropout2d, Linear, \
DataParallel
from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn.functional import batch_norm
from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
from ..functions import batchnormtrain, batchnormeval, sum_square
from ..functions import *
from ..parallel import allreduce
__all__ = ['BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'Module', 'Sequential', 'Conv1d',
'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d', 'AvgPool2d',
'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']
class _SyncBatchNorm(_BatchNorm):
# pylint: disable=access-member-before-definition
def __init__(self, num_features, eps=1e-5, momentum=0.1, **kwargs):
super(_SyncBatchNorm, self).__init__(num_features, eps=1e-5, momentum=0.1, **kwargs)
# syncBN
self.writelock = threading.Lock()
nGPUs = torch.cuda.device_count()
self.sharedT = SharedTensor(nGPUs)
def __init__(self, num_features, eps=1e-5, momentum=0.001, affine=True):
super(_SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)
self._is_parallel = False
self._parallel_id = None
self._slave_pipe = None
self.sharedT = SharedTensor(torch.cuda.device_count())
def forward(self, input):
self._check_input_dim(input)
# Resize the input to (B, C, -1).
input_shape = input.size()
input = input.view(input_shape[0], self.num_features, -1)
if not self.training:
std = (self.running_var.clamp(self.eps)).sqrt()
output = batchnormeval(input, self.weight, self.bias, self.running_mean, std)
return output.view(input_shape)
# get global sum(x) and sum(x^2)
xsum, xsquare = self.sharedT(sum_square(input.unsqueeze(3)))
# sum(x) and sum(x^2)
N = input.size(0) * input.size(2)
xsum, xsqsum = sum_square(input)
# all-reduce for global sum(x) and sum(x^2)
igpu = input.get_device()
self.sharedT.push(N, igpu, xsum, xsqsum)
N, xsum, xsqsum = self.sharedT.pull(igpu)
# calculate mean, var
N = len(self.sharedT) * input.size(0) * input.size(2)
mean = xsum / N
sumvar = xsquare - xsum * xsum / N
sumvar = xsqsum - xsum * xsum / N
unbias_var = sumvar / (N - 1)
bias_var = sumvar / N
std = bias_var.clamp(self.eps).sqrt()
# update running_mean and var
self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * mean.data
self.running_var = (1-self.momentum) * self.running_var + self.momentum * unbias_var.data
# forward
return batchnormtrain(input, self.weight, self.bias, mean, std).view(input_shape)
......@@ -61,6 +76,8 @@ class BatchNorm1d(_SyncBatchNorm):
if input.dim() != 2 and input.dim() != 3:
raise ValueError('expected 2D or 3D input (got {}D input)'
.format(input.dim()))
super(BatchNorm2d, self)._check_input_dim(input)
class BatchNorm2d(_SyncBatchNorm):
r"""Cross-GPU Synchronized Batch normalization (SyncBN)
......@@ -70,6 +87,9 @@ class BatchNorm2d(_SyncBatchNorm):
We follow the sync-once implementation described in the paper [2]_ .
Please see the design idea in the `notes <./notes/syncbn.html>`_.
.. note::
Please use ``CUDA_VISIBLE_DEVICES`` to select number of GPUs.
.. math::
y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta
......@@ -106,13 +126,16 @@ class BatchNorm2d(_SyncBatchNorm):
Examples:
>>> # Use exactly the same as standard BatchNorm2d
>>> m = nn.BatchNorm2d(100)
>>> output = m(input)
>>> m = BatchNorm2d(100)
>>> net = torch.nn.DataParallel(m)
>>> output = net(input)
"""
def _check_input_dim(self, input):
if input.dim() != 4:
raise ValueError('expected 4D input (got {}D input)'
.format(input.dim()))
super(BatchNorm2d, self)._check_input_dim(input)
class BatchNorm3d(_SyncBatchNorm):
r"""Please see the docs in :class:`encoding.nn.BatchNorm2d`"""
......@@ -120,10 +143,11 @@ class BatchNorm3d(_SyncBatchNorm):
if input.dim() != 5:
raise ValueError('expected 5D input (got {}D input)'
.format(input.dim()))
super(BatchNorm3d, self)._check_input_dim(input)
class SharedTensor(object):
"""Shared Tensor for cross GPU communication
"""
"""Shared Tensor for cross GPU all reduce operation"""
def __init__(self, nGPUs):
self.mutex = threading.Lock()
self.all_tasks_done = threading.Condition(self.mutex)
......@@ -131,28 +155,37 @@ class SharedTensor(object):
self._clear()
def _clear(self):
self.list = []
self.N = 0
self.dict = {}
self.push_tasks = self.nGPUs
self.reduce_tasks = self.nGPUs
def __call__(self, *inputs):
def push(self, *inputs):
if self.nGPUs <= 1:
return tuple(inputs)
# push from device
with self.mutex:
if self.push_tasks == 0:
self._clear()
self.list.extend(list(*inputs))
idx = self.nGPUs - self.push_tasks
self.N += inputs[0]
igpu = inputs[1]
self.dict[igpu] = inputs[2:]
#idx = self.nGPUs - self.push_tasks
self.push_tasks -= 1
with self.all_tasks_done:
if self.push_tasks == 0:
self.all_tasks_done.notify_all()
while self.push_tasks:
self.all_tasks_done.wait()
def pull(self, igpu):
# pull from device
with self.mutex:
if self.reduce_tasks == self.nGPUs:
assert(len(self.list) == 2 * self.nGPUs)
self.list = allreduce(2, *self.list)
if igpu == 0:
assert(len(self.dict) == self.nGPUs)
# flatten the tensors
self.list = [t for i in range(len(self.dict)) for t in self.dict[i]]
self.outlist = allreduce(2, *self.list)
self.reduce_tasks -= 1
else:
self.reduce_tasks -= 1
......@@ -162,10 +195,11 @@ class SharedTensor(object):
while self.reduce_tasks:
self.all_tasks_done.wait()
# all reduce done
return self.list[2*idx], self.list[2*idx+1]
return self.N, self.outlist[2*igpu], self.outlist[2*igpu+1]
def __len__(self):
return self.nGPUs
def __repr__(self):
return ('SharedTensor')
......@@ -11,31 +11,50 @@
"""Encoding Data Parallel"""
import threading
import torch
from torch.autograd import Function
from torch.autograd import Variable, Function
import torch.cuda.comm as comm
from torch.nn.parallel.data_parallel import DataParallel
from torch.nn.parallel.replicate import replicate
from torch.nn.parallel.parallel_apply import get_a_var
from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
__all__ = ['allreduce', 'DataParallelModel', 'DataParallelCriterion']
torch_ver = torch.__version__[:3]
def allreduce(num_inputs, *inputs):
def allreduce(*inputs):
"""Cross GPU all reduce autograd operation for calculate mean and
variance in SyncBN.
"""
target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
result = ReduceAddCoalesced.apply(target_gpus[0], num_inputs, *inputs)
outputs = Broadcast.apply(target_gpus, *result)
assert len(outputs) == len(inputs)
return outputs
return AllReduce.apply(*inputs)
class AllReduce(Function):
    """Autograd-aware coalesced all-reduce across GPUs.

    Takes a flat argument list of ``num_inputs`` tensors per device,
    reduce-adds each group onto the first device, then broadcasts the
    summed results back to every participating device.
    """

    @staticmethod
    def forward(ctx, num_inputs, *inputs):
        # Remember the group size and the device layout for backward.
        ctx.num_inputs = num_inputs
        ctx.target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
        # Regroup the flat argument list into per-device chunks.
        inputs = [inputs[i:i + num_inputs]
                  for i in range(0, len(inputs), num_inputs)]
        # sort before reduce sum
        inputs = sorted(inputs, key=lambda i: i[0].get_device())
        results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
        outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
        # Flatten the per-device results back into one tuple of tensors.
        return tuple([t for tensors in outputs for t in tensors])

    @staticmethod
    def backward(ctx, *inputs):
        # The gradient of an all-reduce is itself an all-reduce of the grads.
        # NOTE(review): unlike forward, the grads are not sorted by device
        # here — presumably they already arrive in device order; confirm.
        inputs = [i.data for i in inputs]
        inputs = [inputs[i:i + ctx.num_inputs]
                  for i in range(0, len(inputs), ctx.num_inputs)]
        results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
        outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
        # Leading None is the gradient slot for the non-tensor ``num_inputs``.
        return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors])
class Reduce(Function):
@staticmethod
def forward(ctx, *inputs):
ctx.target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
inputs = sorted(inputs, key=lambda i: i.get_device())
return comm.reduce_add(inputs)
@staticmethod
......@@ -101,12 +120,13 @@ class DataParallelCriterion(DataParallel):
# scattering the targets instead
if not self.device_ids:
return self.module(inputs, *targets, **kwargs)
targets, kwargs = inputs(targets, kwargs, self.device_ids)
targets, kwargs = self.scatter(targets, kwargs, self.device_ids)
if len(self.device_ids) == 1:
return self.module(inputs, *targets[0], **kwargs[0])
replicas = replicate(self.module, self.device_ids[:len(inputs)])
replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
outputs = _criterion_parallel_apply(replicas, inputs, targets, kwargs)
return Reduce.apply(*outputs) / len(outputs)
#return self.gather(outputs, self.output_device).mean()
def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
......@@ -123,14 +143,16 @@ def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices
lock = threading.Lock()
results = {}
grad_enabled = torch.is_grad_enabled()
if torch_ver != "0.3":
grad_enabled = torch.is_grad_enabled()
def _worker(i, module, input, target, kwargs, device=None):
torch.set_grad_enabled(grad_enabled)
if torch_ver != "0.3":
torch.set_grad_enabled(grad_enabled)
if device is None:
device = get_a_var(input).get_device()
try:
with torch.cuda.device_of(var_input):
with torch.cuda.device(device):
output = module(*(input + target), **kwargs)
with lock:
results[i] = output
......@@ -142,7 +164,7 @@ def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices
threads = [threading.Thread(target=_worker,
args=(i, module, input, target,
kwargs, device),)
for i, (module, input, target, kwargs) in
for i, (module, input, target, kwargs, device) in
enumerate(zip(modules, inputs, targets, kwargs_tup, devices))]
for thread in threads:
......
......@@ -9,36 +9,18 @@
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Util Tools"""
import shutil
import os
import errno
import requests
import shutil
import hashlib
import math
from tqdm import tqdm
import numpy as np
import torch
__all__ = ['get_optimizer', 'LR_Scheduler', 'save_checkpoint']
def get_optimizer(args, model, diff_LR=True):
"""
Returns an optimizer for given model,
Args:
args: :attr:`args.lr`, :attr:`args.momentum`, :attr:`args.weight_decay`
model: if using different lr, define `model.pretrained` and `model.head`.
"""
if diff_LR and model.pretrained is not None:
print('Using different learning rate for pre-trained features')
optimizer = torch.optim.SGD([
{'params': model.pretrained.parameters()},
{'params': model.head.parameters(),
'lr': args.lr*10},
],
lr=args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
else:
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
return optimizer
__all__ = ['LR_Scheduler', 'save_checkpoint', 'batch_pix_accuracy',
'batch_intersection_union', 'download', 'mkdir', 'check_sha1']
class LR_Scheduler(object):
......@@ -105,3 +87,245 @@ def save_checkpoint(state, args, is_best, filename='checkpoint.pth.tar'):
torch.save(state, filename)
if is_best:
shutil.copyfile(filename, directory + 'model_best.pth.tar')
def batch_pix_accuracy(predict, target):
    """Compute per-batch pixel-accuracy counts.

    Args:
        predict: model output, a 4D tensor (batch, class, H, W)
        target: ground-truth labels, a 3D tensor (batch, H, W)

    Returns:
        (pixel_correct, pixel_labeled): count of correctly classified
        pixels and count of labeled (target >= 0) pixels.
    """
    # Predicted class is the argmax over the channel dimension.
    _, pred_labels = torch.max(predict, 1)
    # Counting is done in numpy because torch.eq misbehaved on some torch
    # versions (see the original TODO); the result is identical.
    pred_np = pred_labels.cpu().numpy()
    target_np = target.cpu().numpy()
    labeled_mask = target_np >= 0
    pixel_labeled = np.sum(labeled_mask)
    pixel_correct = np.sum((pred_np == target_np) * labeled_mask)
    assert pixel_correct <= pixel_labeled
    return pixel_correct, pixel_labeled
def batch_intersection_union(predict, target, nclass):
    """Compute per-batch, per-class intersection and union pixel areas.

    Args:
        predict: model output, a 4D tensor (batch, class, H, W)
        target: ground-truth labels, a 3D tensor (batch, H, W)
        nclass: number of categories (int)

    Returns:
        (area_inter, area_union): 1D float tensors of length ``nclass``
        with per-class intersection and union pixel counts.
    """
    _, predict = torch.max(predict, 1)
    mini = 0
    maxi = nclass - 1
    # Zero-out predictions on unlabeled pixels so they never count as hits.
    predict = predict * (target >= 0).type_as(predict)
    intersection = predict * (predict == target).type_as(predict)
    # torch.histc is used instead of np.histogram (a numpy variant existed
    # here but was dead code); note class-0 pixels and mismatches both land
    # in bin 0, matching the original counting behavior.
    area_inter = torch.histc(intersection.cpu().float(), bins=nclass,
                             min=mini, max=maxi)
    area_pred = torch.histc(predict.cpu().float(), bins=nclass, min=mini,
                            max=maxi)
    # Out-of-range values (ignored labels < 0) are dropped by histc.
    area_lab = torch.histc(target.cpu().float(), bins=nclass, min=mini,
                           max=maxi)
    area_union = area_pred + area_lab - area_inter
    return area_inter, area_union
def get_selabel_vector(target, nclass):
    """Build the SE-loss class-presence label for a batch.

    Args:
        target: label 3D tensor (B x H x W)
        nclass: number of categories (int)

    Returns:
        2D tensor (B x nclass) whose entry (i, c) is 1 when class c
        appears anywhere in image i, otherwise 0.
    """
    batch_size = target.size(0)
    tvect = torch.zeros(batch_size, nclass)
    for idx in range(batch_size):
        # Per-image class histogram; any nonzero bin means the class occurs.
        hist = torch.histc(target[idx].data.float(),
                           bins=nclass, min=0, max=nclass - 1)
        tvect[idx] = hist > 0
    return tvect
def get_mask_pallete(npimg, dataset='detail'):
    """Get image color pallete for visualizing masks"""
    # Recover the boundary label for PASCAL VOC before colorizing.
    if dataset == 'pascal_voc':
        npimg[npimg == 21] = 255
    out_img = Image.fromarray(npimg.astype('uint8'))
    # Pick the dataset-specific palette (default: PASCAL VOC colors).
    if dataset == 'ade20k':
        palette = adepallete
    elif dataset == 'cityscapes':
        palette = citypallete
    else:
        palette = vocpallete
    out_img.putpalette(palette)
    return out_img
def download(url, path=None, overwrite=False, sha1_hash=None):
    """Download a given URL.

    Parameters
    ----------
    url : str
        URL to download
    path : str, optional
        Destination path to store downloaded file. By default stores to the
        current directory with same name as in url.
    overwrite : bool, optional
        Whether to overwrite destination file if already exists.
    sha1_hash : str, optional
        Expected sha1 hash in hexadecimal digits. Will ignore existing file
        when hash is specified but doesn't match.

    Returns
    -------
    str
        The file path of the downloaded file.
    """
    # Resolve the destination filename from url and/or path.
    if path is None:
        fname = url.split('/')[-1]
    else:
        path = os.path.expanduser(path)
        if os.path.isdir(path):
            fname = os.path.join(path, url.split('/')[-1])
        else:
            fname = path

    # Re-download when forced, missing, or when the existing file fails the
    # expected sha1 check.
    needs_fetch = (overwrite
                   or not os.path.exists(fname)
                   or (sha1_hash and not check_sha1(fname, sha1_hash)))
    if needs_fetch:
        dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        print('Downloading %s from %s...'%(fname, url))
        r = requests.get(url, stream=True)
        if r.status_code != 200:
            raise RuntimeError("Failed downloading url %s"%url)
        total_length = r.headers.get('content-length')
        with open(fname, 'wb') as f:
            if total_length is None:  # no content length header
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            else:
                total_length = int(total_length)
                # Progress bar in KB units.
                for chunk in tqdm(r.iter_content(chunk_size=1024),
                                  total=int(total_length / 1024. + 0.5),
                                  unit='KB', unit_scale=False, dynamic_ncols=True):
                    f.write(chunk)

        # Verify the freshly downloaded content against the expected hash.
        if sha1_hash and not check_sha1(fname, sha1_hash):
            raise UserWarning('File {} is downloaded but the content hash does not match. ' \
                              'The repo may be outdated or download may be incomplete. ' \
                              'If the "repo_url" is overridden, consider switching to ' \
                              'the default repo.'.format(fname))

    return fname
def check_sha1(filename, sha1_hash):
    """Check whether the sha1 hash of the file content matches the expected hash.

    Parameters
    ----------
    filename : str
        Path to the file.
    sha1_hash : str
        Expected sha1 hash in hexadecimal digits.

    Returns
    -------
    bool
        Whether the file content matches the expected hash.
    """
    digest = hashlib.sha1()
    with open(filename, 'rb') as stream:
        # Feed the file through in 1 MiB chunks so large files stay cheap.
        for block in iter(lambda: stream.read(1048576), b''):
            digest.update(block)
    return digest.hexdigest() == sha1_hash
def mkdir(path):
    """Create a directory like ``mkdir -p``: succeed silently if it exists."""
    try:
        os.makedirs(path)
    except OSError as exc:
        # Re-raise unless the failure is simply that the directory
        # already exists (works on Python >2.5 without exist_ok).
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
# ref https://github.com/CSAILVision/sceneparsing/blob/master/evaluationCode/utils_eval.py
def pixel_accuracy(im_pred, im_lab):
    """Count correctly predicted pixels among labeled pixels.

    Pixels with label 0 are treated as unlabeled and excluded, so detections
    in unlabeled regions are not penalized.

    Returns (pixel_correct, pixel_labeled); accuracy = correct / labeled.
    """
    pred = np.asarray(im_pred)
    lab = np.asarray(im_lab)
    labeled = lab > 0
    pixel_labeled = np.sum(labeled)
    pixel_correct = np.sum((pred == lab) * labeled)
    return pixel_correct, pixel_labeled
def intersection_and_union(im_pred, im_lab, num_class):
    """Per-class intersection and union pixel counts for classes 1..num_class-1.

    Label 0 is treated as unlabeled: predictions there are zeroed out first so
    they fall outside the histogram range and are ignored.

    Returns (area_inter, area_union) as length ``num_class - 1`` arrays.
    """
    pred = np.asarray(im_pred)
    lab = np.asarray(im_lab)
    # Zero out predictions wherever the ground truth is unlabeled.
    pred = pred * (lab > 0)
    # Pixels where (masked) prediction and label agree.
    matched = pred * (pred == lab)
    bins = num_class - 1
    span = (1, num_class - 1)
    area_inter, _ = np.histogram(matched, bins=bins, range=span)
    area_pred, _ = np.histogram(pred, bins=bins, range=span)
    area_lab, _ = np.histogram(lab, bins=bins, range=span)
    return area_inter, area_pred + area_lab - area_inter
def _get_voc_pallete(num_cls):
    """Build the PASCAL-VOC-style color palette as a flat [r,g,b,...] list.

    Each class index is expanded bit-by-bit: bits 0/1/2 of the label go to the
    red/green/blue channels at descending bit positions, 3 label bits per round.
    """
    pallete = [0] * (num_cls * 3)
    for j in range(num_cls):
        lab = j
        shift = 7
        while lab:
            for channel in range(3):
                pallete[j * 3 + channel] |= ((lab >> channel) & 1) << shift
            shift -= 1
            lab >>= 3
    return pallete
# Flat [r0, g0, b0, r1, g1, b1, ...] palettes consumed by PIL's
# Image.putpalette. VOC palette is generated; the others are hand-coded.
vocpallete = _get_voc_pallete(256)
# ADE20K segmentation palette (first triple is black background).
adepallete = [0,0,0,120,120,120,180,120,120,6,230,230,80,50,50,4,200,3,120,120,80,140,140,140,204,5,255,230,230,230,4,250,7,224,5,255,235,255,7,150,5,61,120,120,70,8,255,51,255,6,82,143,255,140,204,255,4,255,51,7,204,70,3,0,102,200,61,230,250,255,6,51,11,102,255,255,7,71,255,9,224,9,7,230,220,220,220,255,9,92,112,9,255,8,255,214,7,255,224,255,184,6,10,255,71,255,41,10,7,255,255,224,255,8,102,8,255,255,61,6,255,194,7,255,122,8,0,255,20,255,8,41,255,5,153,6,51,255,235,12,255,160,150,20,0,163,255,140,140,140,250,10,15,20,255,0,31,255,0,255,31,0,255,224,0,153,255,0,0,0,255,255,71,0,0,235,255,0,173,255,31,0,255,11,200,200,255,82,0,0,255,245,0,61,255,0,255,112,0,255,133,255,0,0,255,163,0,255,102,0,194,255,0,0,143,255,51,255,0,0,82,255,0,255,41,0,255,173,10,0,255,173,255,0,0,255,153,255,92,0,255,0,255,255,0,245,255,0,102,255,173,0,255,0,20,255,184,184,0,31,255,0,255,61,0,71,255,255,0,204,0,255,194,0,255,82,0,10,255,0,112,255,51,0,255,0,194,255,0,122,255,0,255,163,255,153,0,0,255,10,255,112,0,143,255,0,82,0,255,163,255,0,255,235,0,8,184,170,133,0,255,0,255,92,184,0,255,255,0,31,0,184,255,0,214,255,255,0,112,92,255,0,0,224,255,112,224,255,70,184,160,163,0,255,153,0,255,71,255,0,255,0,163,255,204,0,255,0,143,0,255,235,133,255,0,255,0,235,245,0,255,255,0,122,255,245,0,10,190,212,214,255,0,0,204,255,20,0,255,255,255,0,0,153,255,0,41,255,0,255,204,41,0,255,41,255,0,173,0,255,0,245,255,71,0,255,122,0,255,0,255,184,0,92,255,184,255,0,0,133,255,255,214,0,25,194,194,102,255,0,92,0,255]
# Cityscapes segmentation palette, flat [r, g, b, ...] for PIL putpalette.
# NOTE(review): the pasted source split this literal mid-number
# ("...,160,160,1" then "28,96,..."), which is a SyntaxError; the token is
# rejoined here as 128.
citypallete = [
128,64,128,244,35,232,70,70,70,102,102,156,190,153,153,153,153,153,250,170,30,220,220,0,107,142,35,152,251,152,70,130,180,220,20,60,255,0,0,0,0,142,0,0,70,0,60,100,0,80,100,0,0,230,119,11,32,128,192,0,0,64,128,128,64,128,0,192,128,128,192,128,64,64,0,192,64,0,64,192,0,192,192,0,64,64,128,192,64,128,64,192,128,192,192,128,0,0,64,128,0,64,0,128,64,128,128,64,0,0,192,128,0,192,0,128,192,128,128,192,64,0,64,192,0,64,64,128,64,192,128,64,64,0,192,192,0,192,64,128,192,192,128,192,0,64,64,128,64,64,0,192,64,128,192,64,0,64,192,128,64,192,0,192,192,128,192,192,64,64,64,192,64,64,64,192,64,192,192,64,64,64,192,192,64,192,64,192,192,192,192,192,32,0,0,160,0,0,32,128,0,160,128,0,32,0,128,160,0,128,32,128,128,160,128,128,96,0,0,224,0,0,96,128,0,224,128,0,96,0,128,224,0,128,96,128,128,224,128,128,32,64,0,160,64,0,32,192,0,160,192,0,32,64,128,160,64,128,32,192,128,160,192,128,96,64,0,224,64,0,96,192,0,224,192,0,96,64,128,224,64,128,96,192,128,224,192,128,32,0,64,160,0,64,32,128,64,160,128,64,32,0,192,160,0,192,32,128,192,160,128,192,96,0,64,224,0,64,96,128,64,224,128,64,96,0,192,224,0,192,96,128,192,224,128,192,32,64,64,160,64,64,32,192,64,160,192,64,32,64,192,160,64,192,32,192,192,160,192,192,96,64,64,224,64,64,96,192,64,224,192,64,96,64,192,224,64,192,96,192,192,224,192,192,0,32,0,128,32,0,0,160,0,128,160,0,0,32,128,128,32,128,0,160,128,128,160,128,64,32,0,192,32,0,64,160,0,192,160,0,64,32,128,192,32,128,64,160,128,192,160,128,0,96,0,128,96,0,0,224,0,128,224,0,0,96,128,128,96,128,0,224,128,128,224,128,64,96,0,192,96,0,64,224,0,192,224,0,64,96,128,192,96,128,64,224,128,192,224,128,0,32,64,128,32,64,0,160,64,128,160,64,0,32,192,128,32,192,0,160,192,128,160,192,64,32,64,192,32,64,64,160,64,192,160,64,64,32,192,192,32,192,64,160,192,192,160,192,0,96,64,128,96,64,0,224,64,128,224,64,0,96,192,128,96,192,0,224,192,128,224,192,64,96,64,192,96,64,64,224,64,192,224,64,64,96,192,192,96,192,64,224,192,192,224,192,32,32,0,160,32,0,32,160,0,160,160,0,32,32,128,160,32,128,32,160,128,160,160,128,
96,32,0,224,32,0,96,160,0,224,160,0,96,32,128,224,32,128,96,160,128,224,160,128,32,96,0,160,96,0,32,224,0,160,224,0,32,96,128,160,96,128,32,224,128,160,224,128,96,96,0,224,96,0,96,224,0,224,224,0,96,96,128,224,96,128,96,224,128,224,224,128,32,32,64,160,32,64,32,160,64,160,160,64,32,32,192,160,32,192,32,160,192,160,160,192,96,32,64,224,32,64,96,160,64,224,160,64,96,32,192,224,32,192,96,160,192,224,160,192,32,96,64,160,96,64,32,224,64,160,224,64,32,96,192,160,96,192,32,224,192,160,224,192,96,96,64,224,96,64,96,224,64,224,224,64,96,96,192,224,96,192,96,224,192,0,0,0]
......@@ -53,7 +53,9 @@ def main():
print(model)
# criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = get_optimizer(args, model, False)
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
if args.cuda:
model.cuda()
# Please use CUDA_VISIBLE_DEVICES to control the number of gpus
......
......@@ -33,7 +33,7 @@ class install(setuptools.command.install.install):
f.write('"""This is encoding version file."""\n')
f.write("__version__ = '{}'\n".format(version))
version = '0.3.0'
version = '0.4.0'
try:
sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
cwd=cwd).decode('ascii').strip()
......@@ -41,22 +41,36 @@ try:
except Exception:
pass
try:
import pypandoc
readme = pypandoc.convert('README.md', 'rst')
except(IOError, ImportError):
readme = open('README.md').read()
requirements = [
'numpy',
'tqdm',
'nose',
'torch>=0.3.1',
'cffi>=1.0.0',
]
setup(
name="encoding",
version=version,
description="PyTorch Encoding",
url="https://github.com/zhanghang1989/PyTorch-Encoding",
author="Hang Zhang",
author_email="zhang.hang@rutgers.edu",
# Require cffi.
install_requires=["cffi>=1.0.0"],
setup_requires=["cffi>=1.0.0"],
# Exclude the build files.
packages=find_packages(exclude=["build"]),
# Package where to put the extensions. Has to be a prefix of build.py.
package_data={'encoding': [
author_email="zhanghang0704@gmail.com",
url="https://github.com/zhanghang1989/PyTorch-Encoding",
description="PyTorch Encoding Package",
long_description=readme,
license='MIT',
install_requires=requirements,
packages=find_packages(exclude=["tests", "experiments"]),
package_data={ 'encoding': [
'lib/*.so*', 'lib/*.dylib*',
'_ext/encoding_lib/*.so', '_ext/encoding_lib/*.dylib',
'kernel/*.h', 'kernel/generic/*h',
'src/*.h',
]},
ext_package="",
# Extensions to compile.
......
......@@ -8,16 +8,21 @@
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import encoding
import unittest
import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable, gradcheck
import torchvision.models as models
import encoding
# Shared tolerances for the gradchecks below: EPS is the finite-difference
# step size, ATOL the absolute tolerance on the gradient comparison.
EPS = 1e-6
ATOL = 1e-4
def _assert_tensor_close(a, b, atol=ATOL, rtol=EPS):
    """Assert two tensors are element-wise close within ``atol``/``rtol``.

    Both arguments are moved to CPU and compared as numpy arrays; on failure
    the message reports the max absolute and relative differences.
    """
    npa, npb = a.cpu().numpy(), b.cpu().numpy()
    # fix: rtol was accepted but silently ignored — forward it to allclose.
    assert np.allclose(npa, npb, atol=atol, rtol=rtol), \
        'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(
            a, b, np.abs(npa - npb).max(), np.abs((npa - npb) / np.fmax(npa, 1e-5)).max())
def test_aggregate():
B,N,K,D = 2,3,4,5
......@@ -28,7 +33,7 @@ def test_aggregate():
C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5),
requires_grad=True)
input = (A, X, C)
test = gradcheck(encoding.functions.aggregate, input, eps=1e-6, atol=1e-4)
test = gradcheck(encoding.functions.aggregate, input, eps=EPS, atol=ATOL)
print('Testing aggregate(): {}'.format(test))
......@@ -41,7 +46,7 @@ def test_scaledL2():
S = Variable(torch.cuda.DoubleTensor(K).uniform_(-0.5,0.5),
requires_grad=True)
input = (X, C, S)
test = gradcheck(encoding.functions.scaledL2, input, eps=1e-6, atol=1e-4)
test = gradcheck(encoding.functions.scaledL2, input, eps=EPS, atol=ATOL)
print('Testing scaledL2(): {}'.format(test))
......@@ -51,16 +56,16 @@ def test_encoding():
requires_grad=True)
input = (X,)
layer = encoding.nn.Encoding(C,K).double().cuda()
test = gradcheck(layer, input, eps=1e-6, atol=1e-4)
test = gradcheck(layer, input, eps=EPS, atol=ATOL)
print('Testing encoding(): {}'.format(test))
def test_sum_square():
B,C,H,W = 2,3,4,5
X = Variable(torch.cuda.DoubleTensor(B,C,H,W).uniform_(-0.5,0.5),
B,C,H = 2,3,4
X = Variable(torch.cuda.DoubleTensor(B,C,H).uniform_(-0.5,0.5),
requires_grad=True)
input = (X,)
test = gradcheck(encoding.functions.sum_square, input, eps=1e-6, atol=1e-4)
test = gradcheck(encoding.functions.sum_square, input, eps=EPS, atol=ATOL)
print('Testing sum_square(): {}'.format(test))
......@@ -71,6 +76,97 @@ def test_all_reduce():
x.requires_grad = True
Y = encoding.parallel.allreduce(1, *X)
assert (len(X) == len(Y))
for i in range(1, ngpu):
_assert_tensor_close(Y[i].data, Y[0].data)
input = (1, *X)
#test = gradcheck(encoding.parallel.allreduce, input, eps=EPS, atol=ATOL)
test = gradcheck(encoding.parallel.allreduce, input, eps=EPS, atol=ATOL)
def _test_syncbn(train_mode=True):
    """Gradcheck SyncBatchNorm2d wrapped in DataParallel."""
    # Double-precision random input on GPU (gradcheck needs double).
    batch, channels, height, width = 8, 3, 4, 5
    X = Variable(
        torch.cuda.DoubleTensor(batch, channels, height, width).uniform_(-0.5, 0.5),
        requires_grad=True)
    inputs = (X,)
    # Replicate the synchronized BN layer across GPUs via DataParallel.
    layer = encoding.nn.SyncBatchNorm2d(channels)
    model = torch.nn.DataParallel(layer).double().cuda()
    layer.train(train_mode)
    # Numerically verify gradients through the parallel model.
    result = gradcheck(model, inputs, eps=EPS, atol=ATOL)
    print('Testing SyncBatchNorm2d(): {}'.format(result))
def _test_syncbn_func(train_mode=True):
    """Gradcheck the raw batchnorm autograd function behind SyncBatchNorm."""
    # Small double-precision input on GPU.
    B, C, H = 2, 3, 4
    X = Variable(torch.cuda.DoubleTensor(B,C,H).uniform_(-0.5, 0.5),
                 requires_grad=True)
    # Per-channel sum / squared-sum statistics fed into the function.
    xsum = Variable(torch.ones(C).double().cuda(), requires_grad=True)
    xsqsum = Variable(torch.ones(C).double().cuda(), requires_grad=True)
    # fix: gamma/beta already require grad from the constructor; the original
    # redundantly set .requires_grad = True again afterwards.
    gamma = Variable(torch.ones(C).double().cuda(), requires_grad=True)
    beta = Variable(torch.zeros(C).double().cuda(), requires_grad=True)
    runningVar = torch.ones(C).double().cuda()
    runningMean = torch.zeros(C).double().cuda()
    # Number of elements contributing per channel.
    N = B * H
    inputs = (X, xsum, xsqsum, gamma, beta, runningMean, runningVar, N, 0.1, 1e-5, train_mode)
    # Numerically verify gradients of the autograd Function itself.
    test = gradcheck(encoding.functions.batchnorm.apply, inputs, eps=EPS, atol=ATOL)
    print('Testing batchnorm(): {}'.format(test))
def _checkBatchNormResult(bn1, bn2, input, is_train, cuda=False):
    """Run the same input through two BN modules and assert that outputs,
    input gradients, and running statistics agree."""
    def _find_bn(module):
        # Return the first (Sync)BatchNorm submodule; lets callers pass either
        # a bare BN layer or one wrapped in DataParallel.
        for m in module.modules():
            if isinstance(m, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
                encoding.nn.SyncBatchNorm1d, encoding.nn.SyncBatchNorm2d)):
                return m
    def _syncParameters(bn1, bn2):
        # Reset both, then copy bn1's parameters/statistics into bn2 so the
        # two modules start from identical state.
        bn1.reset_parameters()
        bn2.reset_parameters()
        if bn1.affine and bn2.affine:
            bn2.weight.data.copy_(bn1.weight.data)
            bn2.bias.data.copy_(bn1.bias.data)
            # NOTE(review): running stats are only copied when both modules
            # are affine — presumably fine since both use affine=True here,
            # but confirm this gating is intentional.
            bn2.running_mean.copy_(bn1.running_mean)
            bn2.running_var.copy_(bn1.running_var)
    bn1.train(mode=is_train)
    bn2.train(mode=is_train)
    if cuda:
        input = input.cuda()
    # using the same values for gamma and beta
    _syncParameters(_find_bn(bn1), _find_bn(bn2))
    # Forward the same data through both modules (independent leaf inputs).
    input1 = Variable(input.clone(), requires_grad=True)
    output1 = bn1(input1)
    input2 = Variable(input.clone(), requires_grad=True)
    output2 = bn2(input2)
    _assert_tensor_close(input1.data, input2.data)
    _assert_tensor_close(output1.data, output2.data)
    # In eval mode there is nothing further to compare.
    if not is_train:
        return
    # Backprop an identical scalar loss and compare input gradients and the
    # running statistics updated by the training-mode forward pass.
    (output1 ** 2).sum().backward()
    (output2 ** 2).sum().backward()
    _assert_tensor_close(input1.grad.data, input2.grad.data)
    _assert_tensor_close(_find_bn(bn1).running_mean, _find_bn(bn2).running_mean)
    _assert_tensor_close(_find_bn(bn1).running_var, _find_bn(bn2).running_var)
def testSyncBN():
    """Check SyncBatchNorm2d against the stock BatchNorm2d, then gradcheck."""
    reference = torch.nn.BatchNorm2d(10).cuda().double()
    synced = encoding.nn.SyncBatchNorm2d(10).double()
    synced = torch.nn.DataParallel(synced).cuda()
    # Outputs, gradients, and running stats must match the unsynchronized
    # reference in both training and eval modes, over several random batches.
    for _ in range(10):
        _checkBatchNormResult(reference, synced,
                              torch.rand(16, 10, 16, 16).double(), True, cuda=True)
        _checkBatchNormResult(reference, synced,
                              torch.rand(16, 10, 16, 16).double(), False, cuda=True)
    # Numerical gradient checks on the function and the module.
    _test_syncbn_func(True)
    _test_syncbn(True)
if __name__ == '__main__':
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment