Unverified Commit 67e153dd authored by Hang Zhang, committed by GitHub

update and fix bugs (#51)

parent 71447e1b
@@ -15,8 +15,8 @@ import subprocess
 from torch.utils.ffi import create_extension
 lib_path = os.path.join(os.path.dirname(torch.__file__), 'lib')
-cwd = os.path.dirname(os.path.realpath(__file__))
-encoding_lib_path = os.path.join(cwd, "encoding", "lib")
+cwd = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'encoding/')
+encoding_lib_path = os.path.join(cwd, "lib")
 # clean the build files
 clean_cmd = ['bash', 'clean.sh']
@@ -25,13 +25,13 @@ subprocess.check_call(clean_cmd)
 # build CUDA library
 os.environ['TORCH_BUILD_DIR'] = lib_path
 if platform.system() == 'Darwin':
-    os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.1.dylib')
-    ENCODING_LIB = os.path.join(cwd, 'encoding/lib/libENCODING.dylib')
+    os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.dylib')
+    ENCODING_LIB = os.path.join(cwd, 'lib/libENCODING.dylib')
 else:
     os.environ['CFLAGS'] = '-std=c99'
-    os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.so.1')
-    ENCODING_LIB = os.path.join(cwd, 'encoding/lib/libENCODING.so')
+    os.environ['TH_LIBRARIES'] = os.path.join(lib_path,'libATen.so')
+    ENCODING_LIB = os.path.join(cwd, 'lib/libENCODING.so')
 build_all_cmd = ['bash', 'encoding/make.sh']
 subprocess.check_call(build_all_cmd, env=dict(os.environ))
@@ -45,9 +45,9 @@ defines = [('WITH_CUDA', None)]
 with_cuda = True
 include_path = [os.path.join(lib_path, 'include'),
-                os.path.join(cwd,'encoding/kernel'),
-                os.path.join(cwd,'encoding/kernel/include'),
-                os.path.join(cwd,'encoding/src/')]
+                os.path.join(cwd,'kernel'),
+                os.path.join(cwd,'kernel/include'),
+                os.path.join(cwd,'src/')]
 def make_relative_rpath(path):
     if platform.system() == 'Darwin':
@@ -63,6 +63,7 @@ ffi = create_extension(
     define_macros=defines,
     relative_to=__file__,
     with_cuda=with_cuda,
+    extra_compile_args=["-std=c99"],
     include_dirs = include_path,
     extra_link_args = [
         make_relative_rpath(lib_path),
...
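The hunks above re-root ``cwd`` at the ``encoding/`` package directory, so every later join drops the repeated ``encoding`` path component. A quick path-only sketch (with a hypothetical checkout location) showing that the old and new spellings resolve to the same directory::

    import os.path

    repo = '/home/user/PyTorch-Encoding'   # hypothetical checkout path
    old_cwd = repo
    new_cwd = os.path.join(repo, 'encoding/')

    # old spelling joined from the repo root, new spelling from the package dir
    old_lib = os.path.join(old_cwd, 'encoding', 'lib')
    new_lib = os.path.join(new_cwd, 'lib')
    assert os.path.normpath(old_lib) == os.path.normpath(new_lib)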
 #!/usr/bin/env bash
-rm -rf build/ dist/ torch_encoding.egg-info/ encoding/lib/ encoding/_ext/ __pycache__ encoding/__pycache__
+rm -rf build/ dist/ encoding.egg-info/ encoding/lib/ encoding/_ext/ __pycache__ encoding/__pycache__
...
 .. role:: hidden
     :class: hidden-section
-Dilated Networks
+encoding.dilated
 ================
 We provide correctly dilated pre-trained ResNet and DenseNet (stride of 8) for semantic segmentation.
...
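With the page now titled ``encoding.dilated``, a hedged usage sketch (assuming the constructors keep the torchvision-style ``pretrained`` flag; the function names come from the ``__all__`` list in the ResNet hunk later in this diff)::

    import encoding

    # Dilated backbone with output stride 8, per the note above.
    model = encoding.dilated.resnet50(pretrained=True)
    model.eval()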
@@ -4,10 +4,20 @@
 encoding.functions
 ==================
-.. automodule:: encoding.functions
+.. automodule:: encoding.Functions
 .. currentmodule:: encoding.functions
+:hidden:`batchnorm`
+~~~~~~~~~~~~~~~~~~~
+.. autofunction:: batchnorm
+:hidden:`batchnormeval`
+~~~~~~~~~~~~~~~~~~~~~~~
+.. autofunction:: batchnormeval
 :hidden:`dilatedavgpool2d`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
...
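The newly documented ``batchnorm``/``batchnormeval`` entries are the functional layer underneath the SyncBN module later in this diff. A hedged sketch of how the pieces compose, with shapes and formulas taken from ``_SyncBatchNorm.forward`` below (input viewed as B x C x N); this is only the single-GPU arithmetic, the training path additionally all-reduces the sums across devices::

    import torch
    from torch.autograd import Variable
    from encoding.functions import sum_square, batchnormeval

    x = Variable(torch.randn(4, 8, 16).double().cuda())
    gamma = Variable(torch.ones(8).double().cuda())
    beta = Variable(torch.zeros(8).double().cuda())

    xsum, xsquare = sum_square(x)        # per-channel sum(x) and sum(x^2)
    N = x.size(0) * x.size(2)
    mean = xsum / N
    std = (xsquare / N - mean * mean).clamp(1e-5).sqrt()
    y = batchnormeval(x, gamma, beta, mean, std)   # normalized output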
@@ -10,7 +10,7 @@ Created by `Hang Zhang <http://hangzh.com/>`_
 An optimized PyTorch package with CUDA backend.
 .. note::
-    Please checkout the PyTorch compatible Synchronized Cross-GPU :class:`encoding.nn.BatchNorm2d` and the `mnist example <https://github.com/zhanghang1989/PyTorch-SyncBatchNorm>`_.
+    Please check out the PyTorch-compatible Synchronized Cross-GPU :class:`encoding.nn.SyncBatchNorm2d` and the `MNIST example <https://github.com/zhanghang1989/PyTorch-SyncBatchNorm>`_.
 .. toctree::
     :glob:
@@ -30,8 +30,7 @@ An optimized PyTorch package with CUDA backend.
     :maxdepth: 1
     :caption: Package Reference
-    encoding
-    syncbn
+    nn
     parallel
     dilated
     functions
...
@@ -5,9 +5,9 @@ Install and Citations
 Install from Source
 -------------------
-* Install PyTorch from Source (recommended). Please follow the `PyTorch instructions <https://github.com/pytorch/pytorch#from-source>`_.
-* Install this package
+* Install PyTorch by following the `PyTorch instructions <http://pytorch.org/>`_.
+* Install from source
   - Clone the repo::
@@ -15,12 +15,10 @@ Install from Source
   - On Linux::
-        pip install -r requirements.txt
         python setup.py install
   - On Mac OSX::
-        pip install -r requirements.txt
         MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
 Citations
...
 .. role:: hidden
     :class: hidden-section
-Data Parallel
-=============
+encoding.parallel
+=================
-- The current PyTorch DataParallel does not support multi-GPU loss calculation, which makes the GPU memory usage very imbalanced. We address this issue here by doing Model & CriterionDataParallel.
+- The current PyTorch DataParallel does not support multi-GPU loss calculation, which makes the GPU memory usage very imbalanced. We address this issue here by doing DataParallel for both the Model & Criterion.
 .. note::
     This code is provided together with the paper
@@ -15,16 +15,16 @@ Data Parallel
 .. automodule:: encoding.parallel
 .. currentmodule:: encoding.parallel
-:hidden:`ModelDataParallel`
+:hidden:`DataParallelModel`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: ModelDataParallel
+.. autoclass:: DataParallelModel
     :members:
-:hidden:`CriterionDataParallel`
+:hidden:`DataParallelCriterion`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: CriterionDataParallel
+.. autoclass:: DataParallelCriterion
     :members:
...
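For orientation, the renamed classes are meant to be used as a pair; a hedged sketch of the intended training-loop pattern (toy model, shapes invented for illustration)::

    import torch
    from torch.autograd import Variable
    import encoding

    net = encoding.parallel.DataParallelModel(torch.nn.Linear(10, 2).cuda())
    criterion = encoding.parallel.DataParallelCriterion(
        torch.nn.CrossEntropyLoss().cuda())

    x = Variable(torch.randn(8, 10).cuda())
    y = Variable(torch.cuda.LongTensor(8).random_(2))
    outputs = net(x)              # per-GPU outputs are not gathered
    loss = criterion(outputs, y)  # loss computed on each GPU, then reduced
    loss.backward()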
 .. role:: hidden
     :class: hidden-section
-My PyTorch Utils
-================
+encoding.utils
+==============
 Useful util functions.
 .. automodule:: encoding.utils
 .. currentmodule:: encoding.utils
+:hidden:`LR_Scheduler`
+~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: LR_Scheduler
+    :members:
 :hidden:`get_optimizer`
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -24,3 +18,13 @@ Useful util functions.
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autofunction:: save_checkpoint
+:hidden:`batch_pix_accuracy`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autofunction:: batch_pix_accuracy
+:hidden:`batch_intersection_union`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autofunction:: batch_intersection_union
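A hedged sketch of the two new metric helpers, with the tensor shapes from their docstrings later in this diff (4D score tensor, 3D label tensor)::

    import torch
    from encoding.utils import batch_pix_accuracy, batch_intersection_union

    nclass = 21
    predict = torch.randn(2, nclass, 32, 32)        # raw per-class scores
    target = torch.LongTensor(2, 32, 32).random_(nclass)

    correct, labeled = batch_pix_accuracy(predict, target)
    inter, union = batch_intersection_union(predict, target, nclass)
    print('pixAcc', float(correct) / labeled)
    print('mIoU', (inter / (union + 1e-10)).mean())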
"""Dilated ResNet""" """Dilated ResNet"""
import math import math
import torch.utils.model_zoo as model_zoo import torch.utils.model_zoo as model_zoo
from .. import nn #from .. import nn
import torch.nn as nn
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'BasicBlock', 'Bottleneck'] 'resnet152', 'BasicBlock', 'Bottleneck']
...@@ -25,15 +26,16 @@ class BasicBlock(nn.Module): ...@@ -25,15 +26,16 @@ class BasicBlock(nn.Module):
"""ResNet BasicBlock """ResNet BasicBlock
""" """
expansion = 1 expansion = 1
def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1): def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1,
norm_layer=None):
super(BasicBlock, self).__init__() super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, bias=False) padding=dilation, dilation=dilation, bias=False)
self.bn1 = nn.BatchNorm2d(planes) self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=False)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
padding=first_dilation, dilation=first_dilation, bias=False) padding=first_dilation, dilation=first_dilation, bias=False)
self.bn2 = nn.BatchNorm2d(planes) self.bn2 = norm_layer(planes)
self.downsample = downsample self.downsample = downsample
self.stride = stride self.stride = stride
...@@ -62,18 +64,18 @@ class Bottleneck(nn.Module): ...@@ -62,18 +64,18 @@ class Bottleneck(nn.Module):
# pylint: disable=unused-argument # pylint: disable=unused-argument
expansion = 4 expansion = 4
def __init__(self, inplanes, planes, stride=1, dilation=1, def __init__(self, inplanes, planes, stride=1, dilation=1,
downsample=None, first_dilation=1): downsample=None, first_dilation=1, norm_layer=None):
super(Bottleneck, self).__init__() super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes) self.bn1 = norm_layer(planes)
self.conv2 = nn.Conv2d( self.conv2 = nn.Conv2d(
planes, planes, kernel_size=3, stride=stride, planes, planes, kernel_size=3, stride=stride,
padding=dilation, dilation=dilation, bias=False) padding=dilation, dilation=dilation, bias=False)
self.bn2 = nn.BatchNorm2d(planes) self.bn2 = norm_layer(planes)
self.conv3 = nn.Conv2d( self.conv3 = nn.Conv2d(
planes, planes * 4, kernel_size=1, bias=False) planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4) self.bn3 = norm_layer(planes * 4)
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=False)
self.downsample = downsample self.downsample = downsample
self.dilation = dilation self.dilation = dilation
self.stride = stride self.stride = stride
...@@ -118,18 +120,18 @@ class ResNet(nn.Module): ...@@ -118,18 +120,18 @@ class ResNet(nn.Module):
- Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions." - Yu, Fisher, and Vladlen Koltun. "Multi-scale context aggregation by dilated convolutions."
""" """
# pylint: disable=unused-variable # pylint: disable=unused-variable
def __init__(self, block, layers, num_classes=1000): def __init__(self, block, layers, num_classes=1000, norm_layer=None):
self.inplanes = 64 self.inplanes = 64
super(ResNet, self).__init__() super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False) bias=False)
self.bn1 = nn.BatchNorm2d(64) self.bn1 = norm_layer(64)
self.relu = nn.ReLU(inplace=True) self.relu = nn.ReLU(inplace=False)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0]) self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer)
self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4) self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
self.avgpool = nn.AvgPool2d(7) self.avgpool = nn.AvgPool2d(7)
self.fc = nn.Linear(512 * block.expansion, num_classes) self.fc = nn.Linear(512 * block.expansion, num_classes)
...@@ -137,32 +139,33 @@ class ResNet(nn.Module): ...@@ -137,32 +139,33 @@ class ResNet(nn.Module):
if isinstance(m, nn.Conv2d): if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n)) m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d): elif isinstance(m, norm_layer):
m.weight.data.fill_(1) m.weight.data.fill_(1)
m.bias.data.zero_() m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1): def _make_layer(self, block, planes, blocks, stride=1, dilation=1, norm_layer=None):
downsample = None downsample = None
if stride != 1 or self.inplanes != planes * block.expansion: if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential( downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion, nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False), kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion), norm_layer(planes * block.expansion),
) )
layers = [] layers = []
if dilation == 1 or dilation == 2: if dilation == 1 or dilation == 2:
layers.append(block(self.inplanes, planes, stride, dilation=1, layers.append(block(self.inplanes, planes, stride, dilation=1,
downsample=downsample, first_dilation=dilation)) downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
elif dilation == 4: elif dilation == 4:
layers.append(block(self.inplanes, planes, stride, dilation=2, layers.append(block(self.inplanes, planes, stride, dilation=2,
downsample=downsample, first_dilation=dilation)) downsample=downsample, first_dilation=dilation, norm_layer=norm_layer))
else: else:
raise RuntimeError("=> unknown dilation size: {}".format(dilation)) raise RuntimeError("=> unknown dilation size: {}".format(dilation))
self.inplanes = planes * block.expansion self.inplanes = planes * block.expansion
for i in range(1, blocks): for i in range(1, blocks):
layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation)) layers.append(block(self.inplanes, planes, dilation=dilation, first_dilation=dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers) return nn.Sequential(*layers)
......
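The substantive change in this file is normalization injection: every hard-wired ``nn.BatchNorm2d`` becomes a ``norm_layer`` callable, so the same dilated backbone can be built with plain or synchronized batch norm. A hedged construction sketch (module path assumed from the docs page above)::

    import torch.nn as nn
    from encoding.dilated.resnet import ResNet, Bottleneck  # assumed path

    # Plain single-GPU batch norm; for multi-GPU training pass the
    # synchronized variant from encoding.nn instead.
    model = ResNet(Bottleneck, [3, 4, 6, 3], norm_layer=nn.BatchNorm2d)

Note that the ``isinstance(m, norm_layer)`` check in the weight-init loop means ``norm_layer`` must be a class, not a factory function.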
@@ -8,55 +8,54 @@
 ## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-"""Synchronized Batch Normalization functions"""
+"""Synchronized Cross-GPU Batch Normalization functions"""
 import torch
-from torch.autograd import Function, Variable
+from torch.autograd import Variable, Function
 from .._ext import encoding_lib
 __all__ = ['sum_square', 'batchnormtrain', 'batchnormeval']
+def sum_square(input):
+    r"""Calculate sum of elements and sum of squares for Batch Normalization"""
+    return _sum_square.apply(input)
 class _sum_square(Function):
     @staticmethod
     def forward(ctx, input):
         ctx.save_for_backward(input)
-        B, C, _, _ = input.size()
+        C = input.size(1)
         with torch.cuda.device_of(input):
             xsum = input.new().resize_(C).zero_()
             xsquare = input.new().resize_(C).zero_()
         if isinstance(input, torch.cuda.FloatTensor):
             with torch.cuda.device_of(input):
                 encoding_lib.Encoding_Float_sum_square_Forward(
-                    input.view(B, C, -1), xsum, xsquare)
+                    input, xsum, xsquare)
         elif isinstance(input, torch.cuda.DoubleTensor):
             with torch.cuda.device_of(input):
                 encoding_lib.Encoding_Double_sum_square_Forward(
-                    input.view(B, C, -1), xsum, xsquare)
+                    input, xsum, xsquare)
         else:
-            raise RuntimeError('Unimplemented data type!')
+            raise RuntimeError('Unimplemented data type!', type(input))
         return xsum, xsquare
     @staticmethod
     def backward(ctx, gradSum, gradSquare):
         input, = ctx.saved_variables
-        B, C, H, W = input.data.size()
         with torch.cuda.device_of(input.data):
-            gradInput = Variable(input.data.new().resize_(B, C, H*W).zero_())
+            gradInput = Variable(input.data.new().resize_as_(input.data).zero_())
         if isinstance(input.data, torch.cuda.FloatTensor):
             with torch.cuda.device_of(input.data):
                 encoding_lib.Encoding_Float_sum_square_Backward(
-                    gradInput, input.data.view(B, C, -1), gradSum, gradSquare)
+                    gradInput.data, input.data, gradSum.data, gradSquare.data)
         elif isinstance(input.data, torch.cuda.DoubleTensor):
             with torch.cuda.device_of(input.data):
                 encoding_lib.Encoding_Double_sum_square_Backward(
-                    gradInput, input.data.view(B, C, -1), gradSum, gradSquare)
+                    gradInput.data, input.data, gradSum.data, gradSquare.data)
         else:
             raise RuntimeError('Unimplemented data type!')
-        return gradInput.view(B, C, H, W)
+        return gradInput
-def sum_square(input):
-    r"""Calculate sum of elements and sum of squares for Batch Normalization"""
-    return _sum_square.apply(input)
 class _batchnorm(Function):
@@ -134,3 +133,4 @@ def batchnormeval(input, gamma, beta, mean, std):
     Please see encoding.batchnormtrain_
     """
     return _batchnorm(False)(input, gamma, beta, mean, std)
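The gradient contract the CUDA backward kernel must satisfy is simple: d sum(x)/dx = 1 and d sum(x^2)/dx = 2x per channel. A CPU reference sketch of ``sum_square`` (matching the new 3D ``(B, C, N)`` layout), handy as a mental model or for checking the kernel::

    import torch
    from torch.autograd import Variable

    def sum_square_reference(x):
        # x: (B, C, N) -> per-channel sum(x), sum(x^2)
        return x.sum(0).sum(1), (x * x).sum(0).sum(1)

    x = Variable(torch.randn(2, 3, 4), requires_grad=True)
    xsum, xsquare = sum_square_reference(x)
    (xsum.sum() + xsquare.sum()).backward()
    # grad of sum(x) is 1 and grad of sum(x^2) is 2x:
    print((x.grad.data - (1 + 2 * x.data)).abs().max())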
@@ -17,6 +17,8 @@ THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
     return THCDeviceTensor<real, Dim>();
   }
   int inDim = THCTensor_(nDimension)(state, t);
+  return toDeviceTensor<real, Dim>(state, t);
+  /*
   if (inDim == Dim) {
     return toDeviceTensor<real, Dim>(state, t);
   }
@@ -33,6 +35,7 @@ THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
     }
   }
   return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
+  */
 }
 struct Encoding_(Float2)
...
@@ -8,7 +8,7 @@
 * LICENSE file in the root directory of this source tree
 *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 */
-#include <THC/THC.h>
+#include <THC.h>
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
...
 #!/usr/bin/env bash
 mkdir -p encoding/lib && cd encoding/lib
 # compile and install
 cmake ..
...
@@ -10,12 +10,14 @@
 """Encoding Customized NN Module"""
 import torch
-from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d
+from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d, \
+    NLLLoss, BCELoss, CrossEntropyLoss
 from torch.nn import functional as F
 from .syncbn import BatchNorm2d
-__all__ = ['GramMatrix', 'View', 'Sum', 'Mean', 'Normalize', 'PyramidPooling']
+__all__ = ['GramMatrix', 'SegmentationLosses', 'View', 'Sum', 'Mean',
+           'Normalize', 'PyramidPooling']
 class GramMatrix(Module):
@@ -31,6 +33,46 @@ class GramMatrix(Module):
         gram = features.bmm(features_t) / (ch * h * w)
         return gram
+def softmax_crossentropy(input, target, weight, size_average, ignore_index, reduce=True):
+    return F.nll_loss(F.log_softmax(input, 1), target, weight,
+                      size_average, ignore_index, reduce)
+class SegmentationLosses(CrossEntropyLoss):
+    """2D Cross Entropy Loss with Auxiliary Loss"""
+    def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
+        super(SegmentationLosses, self).__init__(weight, size_average, ignore_index)
+        self.aux = aux
+        self.aux_weight = aux_weight
+    def forward(self, *inputs):
+        if not self.aux:
+            return super(SegmentationLosses, self).forward(*inputs)
+        pred1, pred2, target = tuple(inputs)
+        loss1 = super(SegmentationLosses, self).forward(pred1, target)
+        loss2 = super(SegmentationLosses, self).forward(pred2, target)
+        return loss1 + self.aux_weight * loss2
+"""
+class SegmentationLosses(Module):
+    def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
+        super(SegmentationLosses, self).__init__()
+        self.aux = aux
+        self.aux_weight = aux_weight
+        # Somehow the size average is not handled correctly on multi-gpu, so we average by ourselves.
+        self.nll_loss = NLLLoss(weight, ignore_index=ignore_index, reduce=True)
+    def _forward_each(self, inputs, targets):
+        return self.nll_loss(F.log_softmax(inputs, dim=1), targets)
+    def forward(self, *inputs):
+        if not self.aux:
+            return self._forward_each(*inputs)
+        pred1, pred2, target = tuple(inputs)
+        loss1 = self._forward_each(pred1, target)
+        loss2 = self._forward_each(pred2, target)
+        return loss1 + self.aux_weight * loss2
+"""
 class View(Module):
     """Reshape the input into different size, an inplace operator, support
...
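``SegmentationLosses`` degenerates to a plain ``CrossEntropyLoss`` when ``aux=False``; with ``aux=True`` it expects two predictions and sums the two losses with weight ``aux_weight``. A hedged usage sketch (assuming the class is re-exported from ``encoding.nn``)::

    import torch
    from torch.autograd import Variable
    from encoding.nn import SegmentationLosses  # assumed re-export

    criterion = SegmentationLosses(aux=True, aux_weight=0.2, ignore_index=-1)
    B, nclass, H, W = 2, 21, 16, 16
    pred1 = Variable(torch.randn(B, nclass, H, W), requires_grad=True)
    pred2 = Variable(torch.randn(B, nclass, H, W), requires_grad=True)  # aux head
    target = Variable(torch.LongTensor(B, H, W).random_(nclass))

    loss = criterion(pred1, pred2, target)  # = loss(pred1) + 0.2 * loss(pred2)
    loss.backward()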
@@ -9,48 +9,63 @@
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 """Synchronized Cross-GPU Batch Normalization Module"""
+import functools
+import collections
 import threading
 import torch
 from torch.nn import Module, Sequential, Conv1d, Conv2d, ConvTranspose2d, \
-    ReLU, Sigmoid, MaxPool2d, AvgPool2d, AdaptiveAvgPool2d, Dropout2d, Linear
+    ReLU, Sigmoid, MaxPool2d, AvgPool2d, AdaptiveAvgPool2d, Dropout2d, Linear, \
+    DataParallel
 from torch.nn.modules.batchnorm import _BatchNorm
+from torch.nn.functional import batch_norm
+from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
-from ..functions import batchnormtrain, batchnormeval, sum_square
+from ..functions import *
 from ..parallel import allreduce
 __all__ = ['BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'Module', 'Sequential', 'Conv1d',
            'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d', 'AvgPool2d',
            'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']
 class _SyncBatchNorm(_BatchNorm):
-    # pylint: disable=access-member-before-definition
-    def __init__(self, num_features, eps=1e-5, momentum=0.1, **kwargs):
-        super(_SyncBatchNorm, self).__init__(num_features, eps=1e-5, momentum=0.1, **kwargs)
-        # syncBN
-        self.writelock = threading.Lock()
-        nGPUs = torch.cuda.device_count()
-        self.sharedT = SharedTensor(nGPUs)
+    def __init__(self, num_features, eps=1e-5, momentum=0.001, affine=True):
+        super(_SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)
+        self._is_parallel = False
+        self._parallel_id = None
+        self._slave_pipe = None
+        self.sharedT = SharedTensor(torch.cuda.device_count())
     def forward(self, input):
-        self._check_input_dim(input)
+        # Resize the input to (B, C, -1).
         input_shape = input.size()
         input = input.view(input_shape[0], self.num_features, -1)
         if not self.training:
             std = (self.running_var.clamp(self.eps)).sqrt()
             output = batchnormeval(input, self.weight, self.bias, self.running_mean, std)
             return output.view(input_shape)
-        # get global sum(x) and sum(x^2)
-        xsum, xsquare = self.sharedT(sum_square(input.unsqueeze(3)))
+        # sum(x) and sum(x^2)
+        N = input.size(0) * input.size(2)
+        xsum, xsqsum = sum_square(input)
+        # all-reduce for global sum(x) and sum(x^2)
+        igpu = input.get_device()
+        self.sharedT.push(N, igpu, xsum, xsqsum)
+        N, xsum, xsqsum = self.sharedT.pull(igpu)
         # calculate mean, var
-        N = len(self.sharedT) * input.size(0) * input.size(2)
         mean = xsum / N
-        sumvar = xsquare - xsum * xsum / N
+        sumvar = xsqsum - xsum * xsum / N
         unbias_var = sumvar / (N - 1)
         bias_var = sumvar / N
         std = bias_var.clamp(self.eps).sqrt()
         # update running_mean and var
         self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * mean.data
         self.running_var = (1-self.momentum) * self.running_var + self.momentum * unbias_var.data
         # forward
         return batchnormtrain(input, self.weight, self.bias, mean, std).view(input_shape)
@@ -61,6 +76,8 @@ class BatchNorm1d(_SyncBatchNorm):
         if input.dim() != 2 and input.dim() != 3:
             raise ValueError('expected 2D or 3D input (got {}D input)'
                              .format(input.dim()))
+        super(BatchNorm1d, self)._check_input_dim(input)
 class BatchNorm2d(_SyncBatchNorm):
     r"""Cross-GPU Synchronized Batch normalization (SyncBN)
@@ -70,6 +87,9 @@ class BatchNorm2d(_SyncBatchNorm):
     We follow the sync-once implementation described in the paper [2]_ .
     Please see the design idea in the `notes <./notes/syncbn.html>`_.
+    .. note::
+        Please use ``CUDA_VISIBLE_DEVICES`` to select the number of GPUs.
     .. math::
         y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta
@@ -106,13 +126,16 @@ class BatchNorm2d(_SyncBatchNorm):
     Examples:
         >>> # Use it exactly like the standard BatchNorm2d
-        >>> m = nn.BatchNorm2d(100)
-        >>> output = m(input)
+        >>> m = BatchNorm2d(100)
+        >>> net = torch.nn.DataParallel(m)
+        >>> output = net(input)
     """
     def _check_input_dim(self, input):
         if input.dim() != 4:
             raise ValueError('expected 4D input (got {}D input)'
                              .format(input.dim()))
+        super(BatchNorm2d, self)._check_input_dim(input)
 class BatchNorm3d(_SyncBatchNorm):
     r"""Please see the docs in :class:`encoding.nn.BatchNorm2d`"""
@@ -120,10 +143,11 @@ class BatchNorm3d(_SyncBatchNorm):
         if input.dim() != 5:
             raise ValueError('expected 5D input (got {}D input)'
                              .format(input.dim()))
+        super(BatchNorm3d, self)._check_input_dim(input)
 class SharedTensor(object):
-    """Shared Tensor for cross GPU communication
-    """
+    """Shared Tensor for cross GPU all reduce operation"""
     def __init__(self, nGPUs):
         self.mutex = threading.Lock()
         self.all_tasks_done = threading.Condition(self.mutex)
@@ -131,28 +155,37 @@ class SharedTensor(object):
         self._clear()
     def _clear(self):
-        self.list = []
+        self.N = 0
+        self.dict = {}
         self.push_tasks = self.nGPUs
         self.reduce_tasks = self.nGPUs
-    def __call__(self, *inputs):
-        if self.nGPUs <= 1:
-            return tuple(inputs)
+    def push(self, *inputs):
         # push from device
         with self.mutex:
            if self.push_tasks == 0:
                self._clear()
-           self.list.extend(list(*inputs))
-           idx = self.nGPUs - self.push_tasks
+           self.N += inputs[0]
+           igpu = inputs[1]
+           self.dict[igpu] = inputs[2:]
+           #idx = self.nGPUs - self.push_tasks
           self.push_tasks -= 1
        with self.all_tasks_done:
            if self.push_tasks == 0:
                self.all_tasks_done.notify_all()
            while self.push_tasks:
                self.all_tasks_done.wait()
+    def pull(self, igpu):
        # pull from device
        with self.mutex:
-           if self.reduce_tasks == self.nGPUs:
-               assert(len(self.list) == 2 * self.nGPUs)
-               self.list = allreduce(2, *self.list)
+           if igpu == 0:
+               assert(len(self.dict) == self.nGPUs)
+               # flatten the tensors
+               self.list = [t for i in range(len(self.dict)) for t in self.dict[i]]
+               self.outlist = allreduce(2, *self.list)
                self.reduce_tasks -= 1
            else:
                self.reduce_tasks -= 1
@@ -162,10 +195,11 @@ class SharedTensor(object):
            while self.reduce_tasks:
                self.all_tasks_done.wait()
        # all reduce done
-        return self.list[2*idx], self.list[2*idx+1]
+        return self.N, self.outlist[2*igpu], self.outlist[2*igpu+1]
     def __len__(self):
         return self.nGPUs
     def __repr__(self):
         return ('SharedTensor')
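The reworked ``SharedTensor`` is a two-phase barrier: each GPU thread ``push()``es ``(N, igpu, xsum, xsqsum)``, and on ``pull()`` the thread for GPU 0 flattens the dict in device order and runs a single ``allreduce`` whose result every thread then reads. A CPU-only analogue of the protocol (hypothetical two-worker setup; the real class reduces CUDA tensors and uses lock-guarded condition variables)::

    import threading

    class SimpleShared(object):
        # Hedged analogue of SharedTensor: push stats, barrier, reduce once, pull.
        def __init__(self, nworkers):
            self.push_barrier = threading.Barrier(nworkers)
            self.pull_barrier = threading.Barrier(nworkers)
            self.N, self.stats = 0, {}

        def push(self, N, i, xsum, xsqsum):
            self.N += N                   # the original guards this with a lock
            self.stats[i] = (xsum, xsqsum)
            self.push_barrier.wait()      # original: condition on push_tasks

        def pull(self, i):
            if i == 0:                    # worker 0 reduces for everyone
                self.total = (sum(v[0] for v in self.stats.values()),
                              sum(v[1] for v in self.stats.values()))
            self.pull_barrier.wait()
            return (self.N,) + self.total

    shared = SimpleShared(2)
    def worker(i):
        shared.push(10, i, float(i + 1), float(i + 2))
        print(i, shared.pull(i))

    threads = [threading.Thread(target=worker, args=(j,)) for j in range(2)]
    for t in threads: t.start()
    for t in threads: t.join()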
@@ -11,31 +11,50 @@
 """Encoding Data Parallel"""
 import threading
 import torch
-from torch.autograd import Function
+from torch.autograd import Variable, Function
 import torch.cuda.comm as comm
 from torch.nn.parallel.data_parallel import DataParallel
-from torch.nn.parallel.replicate import replicate
 from torch.nn.parallel.parallel_apply import get_a_var
 from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
 __all__ = ['allreduce', 'DataParallelModel', 'DataParallelCriterion']
+torch_ver = torch.__version__[:3]
-def allreduce(num_inputs, *inputs):
+def allreduce(*inputs):
     """Cross GPU all reduce autograd operation for calculate mean and
     variance in SyncBN.
     """
-    target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
-    result = ReduceAddCoalesced.apply(target_gpus[0], num_inputs, *inputs)
-    outputs = Broadcast.apply(target_gpus, *result)
-    assert len(outputs) == len(inputs)
-    return outputs
+    return AllReduce.apply(*inputs)
+class AllReduce(Function):
+    @staticmethod
+    def forward(ctx, num_inputs, *inputs):
+        ctx.num_inputs = num_inputs
+        ctx.target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
+        inputs = [inputs[i:i + num_inputs]
+                  for i in range(0, len(inputs), num_inputs)]
+        # sort before reduce sum
+        inputs = sorted(inputs, key=lambda i: i[0].get_device())
+        results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
+        outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
+        return tuple([t for tensors in outputs for t in tensors])
+    @staticmethod
+    def backward(ctx, *inputs):
+        inputs = [i.data for i in inputs]
+        inputs = [inputs[i:i + ctx.num_inputs]
+                  for i in range(0, len(inputs), ctx.num_inputs)]
+        results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
+        outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
+        return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors])
 class Reduce(Function):
     @staticmethod
     def forward(ctx, *inputs):
         ctx.target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
+        inputs = sorted(inputs, key=lambda i: i.get_device())
         return comm.reduce_add(inputs)
     @staticmethod
@@ -101,12 +120,13 @@ class DataParallelCriterion(DataParallel):
         # scattering the targets instead
         if not self.device_ids:
             return self.module(inputs, *targets, **kwargs)
-        targets, kwargs = inputs(targets, kwargs, self.device_ids)
+        targets, kwargs = self.scatter(targets, kwargs, self.device_ids)
         if len(self.device_ids) == 1:
             return self.module(inputs, *targets[0], **kwargs[0])
-        replicas = replicate(self.module, self.device_ids[:len(inputs)])
+        replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
         outputs = _criterion_parallel_apply(replicas, inputs, targets, kwargs)
         return Reduce.apply(*outputs) / len(outputs)
+        #return self.gather(outputs, self.output_device).mean()
 def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
@@ -123,14 +143,16 @@ def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
     lock = threading.Lock()
     results = {}
-    grad_enabled = torch.is_grad_enabled()
+    if torch_ver != "0.3":
+        grad_enabled = torch.is_grad_enabled()
     def _worker(i, module, input, target, kwargs, device=None):
-        torch.set_grad_enabled(grad_enabled)
+        if torch_ver != "0.3":
+            torch.set_grad_enabled(grad_enabled)
         if device is None:
             device = get_a_var(input).get_device()
         try:
-            with torch.cuda.device_of(var_input):
+            with torch.cuda.device(device):
                 output = module(*(input + target), **kwargs)
             with lock:
                 results[i] = output
@@ -142,7 +164,7 @@ def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
     threads = [threading.Thread(target=_worker,
                                 args=(i, module, input, target,
                                       kwargs, device),)
-               for i, (module, input, target, kwargs) in
+               for i, (module, input, target, kwargs, device) in
                enumerate(zip(modules, inputs, targets, kwargs_tup, devices))]
     for thread in threads:
...
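The new ``AllReduce`` is symmetric: its backward performs the same reduce-add plus broadcast as the forward (with an extra ``None`` for the ``num_inputs`` slot). The calling convention, as exercised by the test later in this diff, is ``num_inputs`` followed by the flattened per-GPU tensors::

    import torch
    from torch.autograd import Variable
    import encoding

    ngpu = torch.cuda.device_count()
    xs = [Variable(torch.ones(3).cuda(i) * (i + 1), requires_grad=True)
          for i in range(ngpu)]
    ys = encoding.parallel.allreduce(1, *xs)  # one tensor per GPU
    # every output now holds 1 + 2 + ... + ngpu, one copy per device
    print([y.data.cpu() for y in ys])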
@@ -9,36 +9,18 @@
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 """Encoding Util Tools"""
-import shutil
 import os
+import errno
+import requests
+import shutil
+import hashlib
 import math
+from tqdm import tqdm
+import numpy as np
+from PIL import Image
 import torch
-__all__ = ['get_optimizer', 'LR_Scheduler', 'save_checkpoint']
+__all__ = ['LR_Scheduler', 'save_checkpoint', 'batch_pix_accuracy',
+           'batch_intersection_union', 'download', 'mkdir', 'check_sha1']
-def get_optimizer(args, model, diff_LR=True):
-    """
-    Returns an optimizer for given model,
-    Args:
-        args: :attr:`args.lr`, :attr:`args.momentum`, :attr:`args.weight_decay`
-        model: if using different lr, define `model.pretrained` and `model.head`.
-    """
-    if diff_LR and model.pretrained is not None:
-        print('Using different learning rate for pre-trained features')
-        optimizer = torch.optim.SGD([
-            {'params': model.pretrained.parameters()},
-            {'params': model.head.parameters(),
-             'lr': args.lr*10},
-        ],
-            lr=args.lr,
-            momentum=args.momentum,
-            weight_decay=args.weight_decay)
-    else:
-        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
-                                    momentum=args.momentum,
-                                    weight_decay=args.weight_decay)
-    return optimizer
 class LR_Scheduler(object):
@@ -105,3 +87,245 @@ def save_checkpoint(state, args, is_best, filename='checkpoint.pth.tar'):
     torch.save(state, filename)
     if is_best:
         shutil.copyfile(filename, directory + 'model_best.pth.tar')
+def batch_pix_accuracy(predict, target):
+    """Batch Pixel Accuracy
+    Args:
+        predict: input 4D tensor
+        target: label 3D tensor
+    """
+    _, predict = torch.max(predict, 1)
+    # pixel_labeled = (target >= 0).sum().item()
+    # TODO currently torch.eq is not working as expected, change back when it's fixed
+    # pixel_correct = torch.eq(predict, target).sum().item()
+    predict = predict.cpu().numpy()
+    target = target.cpu().numpy()
+    pixel_labeled = np.sum(target >= 0)
+    pixel_correct = np.sum((predict == target)*(target >= 0))
+    assert(pixel_correct <= pixel_labeled)
+    return pixel_correct, pixel_labeled
+def batch_intersection_union(predict, target, nclass):
+    """Batch Intersection over Union
+    Args:
+        predict: input 4D tensor
+        target: label 3D tensor
+        nclass: number of categories (int)
+    """
+    _, predict = torch.max(predict, 1)
+    mini = 0
+    maxi = nclass - 1
+    nbins = nclass
+    """
+    predict = predict.cpu().numpy()
+    target = target.cpu().numpy()
+    predict = predict * (target >= 0).astype(predict.dtype)
+    intersection = predict * (predict == target)
+    # areas of intersection and union
+    area_inter, _ = np.histogram(intersection, bins=nbins,
+                                 range=(mini, maxi))
+    area_pred, _ = np.histogram(predict, bins=nbins,
+                                range=(mini, maxi))
+    area_lab, _ = np.histogram(target, bins=nbins,
+                               range=(mini, maxi))
+    area_union = area_pred + area_lab - area_inter
+    # Somehow a PyTorch update breaks this; will change back if fixed
+    """
+    predict = predict * (target >= 0).type_as(predict)
+    intersection = predict * (predict == target).type_as(predict)
+    area_inter = torch.histc(intersection.cpu().float(), bins=nclass,
+                             min=mini, max=maxi)
+    area_pred = torch.histc(predict.cpu().float(), bins=nclass, min=mini,
+                            max=maxi)
+    area_lab = torch.histc(target.cpu().float(), bins=nclass, min=mini,
+                           max=maxi)
+    area_union = area_pred + area_lab - area_inter
+    return area_inter, area_union
+def get_selabel_vector(target, nclass):
+    """Get SE-Loss Label in a batch
+    Args:
+        predict: input 4D tensor
+        target: label 3D tensor (BxHxW)
+        nclass: number of categories (int)
+    Output:
+        2D tensor (BxnClass)
+    """
+    batch = target.size(0)
+    tvect = torch.zeros(batch, nclass)
+    for i in range(batch):
+        hist = torch.histc(target[i].data.float(),
+                           bins=nclass, min=0,
+                           max=nclass-1)
+        vect = hist>0
+        tvect[i] = vect
+    return tvect
+def get_mask_pallete(npimg, dataset='detail'):
+    """Get image color palette for visualizing masks"""
+    # recovery boundary
+    if dataset == 'pascal_voc':
+        npimg[npimg==21] = 255
+    # put colormap
+    out_img = Image.fromarray(npimg.astype('uint8'))
+    if dataset == 'ade20k':
+        out_img.putpalette(adepallete)
+    elif dataset == 'cityscapes':
+        out_img.putpalette(citypallete)
+    else:
+        out_img.putpalette(vocpallete)
+    return out_img
+def download(url, path=None, overwrite=False, sha1_hash=None):
+    """Download a given URL
+    Parameters
+    ----------
+    url : str
+        URL to download
+    path : str, optional
+        Destination path to store downloaded file. By default stores to the
+        current directory with same name as in url.
+    overwrite : bool, optional
+        Whether to overwrite destination file if already exists.
+    sha1_hash : str, optional
+        Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified
+        but doesn't match.
+    Returns
+    -------
+    str
+        The file path of the downloaded file.
+    """
+    if path is None:
+        fname = url.split('/')[-1]
+    else:
+        path = os.path.expanduser(path)
+        if os.path.isdir(path):
+            fname = os.path.join(path, url.split('/')[-1])
+        else:
+            fname = path
+    if overwrite or not os.path.exists(fname) or (sha1_hash and not check_sha1(fname, sha1_hash)):
+        dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
+        if not os.path.exists(dirname):
+            os.makedirs(dirname)
+        print('Downloading %s from %s...'%(fname, url))
+        r = requests.get(url, stream=True)
+        if r.status_code != 200:
+            raise RuntimeError("Failed downloading url %s"%url)
+        total_length = r.headers.get('content-length')
+        with open(fname, 'wb') as f:
+            if total_length is None: # no content length header
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk: # filter out keep-alive new chunks
+                        f.write(chunk)
+            else:
+                total_length = int(total_length)
+                for chunk in tqdm(r.iter_content(chunk_size=1024),
+                                  total=int(total_length / 1024. + 0.5),
+                                  unit='KB', unit_scale=False, dynamic_ncols=True):
+                    f.write(chunk)
+        if sha1_hash and not check_sha1(fname, sha1_hash):
+            raise UserWarning('File {} is downloaded but the content hash does not match. ' \
+                              'The repo may be outdated or download may be incomplete. ' \
+                              'If the "repo_url" is overridden, consider switching to ' \
+                              'the default repo.'.format(fname))
+    return fname
+def check_sha1(filename, sha1_hash):
+    """Check whether the sha1 hash of the file content matches the expected hash.
+    Parameters
+    ----------
+    filename : str
+        Path to the file.
+    sha1_hash : str
+        Expected sha1 hash in hexadecimal digits.
+    Returns
+    -------
+    bool
+        Whether the file content matches the expected hash.
+    """
+    sha1 = hashlib.sha1()
+    with open(filename, 'rb') as f:
+        while True:
+            data = f.read(1048576)
+            if not data:
+                break
+            sha1.update(data)
+    return sha1.hexdigest() == sha1_hash
+def mkdir(path):
+    """Make a directory; no error if it already exists."""
+    try:
+        os.makedirs(path)
+    except OSError as exc: # Python >2.5
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else:
+            raise
+# ref https://github.com/CSAILVision/sceneparsing/blob/master/evaluationCode/utils_eval.py
+def pixel_accuracy(im_pred, im_lab):
+    im_pred = np.asarray(im_pred)
+    im_lab = np.asarray(im_lab)
+    # Remove classes from unlabeled pixels in gt image.
+    # We should not penalize detections in unlabeled portions of the image.
+    pixel_labeled = np.sum(im_lab > 0)
+    pixel_correct = np.sum((im_pred == im_lab) * (im_lab > 0))
+    #pixel_accuracy = 1.0 * pixel_correct / pixel_labeled
+    return pixel_correct, pixel_labeled
+def intersection_and_union(im_pred, im_lab, num_class):
+    im_pred = np.asarray(im_pred)
+    im_lab = np.asarray(im_lab)
+    # Remove classes from unlabeled pixels in gt image.
+    im_pred = im_pred * (im_lab > 0)
+    # Compute area intersection:
+    intersection = im_pred * (im_pred == im_lab)
+    area_inter, _ = np.histogram(intersection, bins=num_class-1,
+                                 range=(1, num_class - 1))
+    # Compute area union:
+    area_pred, _ = np.histogram(im_pred, bins=num_class-1,
+                                range=(1, num_class - 1))
+    area_lab, _ = np.histogram(im_lab, bins=num_class-1,
+                               range=(1, num_class - 1))
+    area_union = area_pred + area_lab - area_inter
+    return area_inter, area_union
+def _get_voc_pallete(num_cls):
+    n = num_cls
+    pallete = [0]*(n*3)
+    for j in range(0, n):
+        lab = j
+        pallete[j*3+0] = 0
+        pallete[j*3+1] = 0
+        pallete[j*3+2] = 0
+        i = 0
+        while (lab > 0):
+            pallete[j*3+0] |= (((lab >> 0) & 1) << (7-i))
+            pallete[j*3+1] |= (((lab >> 1) & 1) << (7-i))
+            pallete[j*3+2] |= (((lab >> 2) & 1) << (7-i))
+            i = i + 1
+            lab >>= 3
+    return pallete
+vocpallete = _get_voc_pallete(256)
adepallete = [0,0,0,120,120,120,180,120,120,6,230,230,80,50,50,4,200,3,120,120,80,140,140,140,204,5,255,230,230,230,4,250,7,224,5,255,235,255,7,150,5,61,120,120,70,8,255,51,255,6,82,143,255,140,204,255,4,255,51,7,204,70,3,0,102,200,61,230,250,255,6,51,11,102,255,255,7,71,255,9,224,9,7,230,220,220,220,255,9,92,112,9,255,8,255,214,7,255,224,255,184,6,10,255,71,255,41,10,7,255,255,224,255,8,102,8,255,255,61,6,255,194,7,255,122,8,0,255,20,255,8,41,255,5,153,6,51,255,235,12,255,160,150,20,0,163,255,140,140,140,250,10,15,20,255,0,31,255,0,255,31,0,255,224,0,153,255,0,0,0,255,255,71,0,0,235,255,0,173,255,31,0,255,11,200,200,255,82,0,0,255,245,0,61,255,0,255,112,0,255,133,255,0,0,255,163,0,255,102,0,194,255,0,0,143,255,51,255,0,0,82,255,0,255,41,0,255,173,10,0,255,173,255,0,0,255,153,255,92,0,255,0,255,255,0,245,255,0,102,255,173,0,255,0,20,255,184,184,0,31,255,0,255,61,0,71,255,255,0,204,0,255,194,0,255,82,0,10,255,0,112,255,51,0,255,0,194,255,0,122,255,0,255,163,255,153,0,0,255,10,255,112,0,143,255,0,82,0,255,163,255,0,255,235,0,8,184,170,133,0,255,0,255,92,184,0,255,255,0,31,0,184,255,0,214,255,255,0,112,92,255,0,0,224,255,112,224,255,70,184,160,163,0,255,153,0,255,71,255,0,255,0,163,255,204,0,255,0,143,0,255,235,133,255,0,255,0,235,245,0,255,255,0,122,255,245,0,10,190,212,214,255,0,0,204,255,20,0,255,255,255,0,0,153,255,0,41,255,0,255,204,41,0,255,41,255,0,173,0,255,0,245,255,71,0,255,122,0,255,0,255,184,0,92,255,184,255,0,0,133,255,255,214,0,25,194,194,102,255,0,92,0,255]
citypallete = [
128,64,128,244,35,232,70,70,70,102,102,156,190,153,153,153,153,153,250,170,30,220,220,0,107,142,35,152,251,152,70,130,180,220,20,60,255,0,0,0,0,142,0,0,70,0,60,100,0,80,100,0,0,230,119,11,32,128,192,0,0,64,128,128,64,128,0,192,128,128,192,128,64,64,0,192,64,0,64,192,0,192,192,0,64,64,128,192,64,128,64,192,128,192,192,128,0,0,64,128,0,64,0,128,64,128,128,64,0,0,192,128,0,192,0,128,192,128,128,192,64,0,64,192,0,64,64,128,64,192,128,64,64,0,192,192,0,192,64,128,192,192,128,192,0,64,64,128,64,64,0,192,64,128,192,64,0,64,192,128,64,192,0,192,192,128,192,192,64,64,64,192,64,64,64,192,64,192,192,64,64,64,192,192,64,192,64,192,192,192,192,192,32,0,0,160,0,0,32,128,0,160,128,0,32,0,128,160,0,128,32,128,128,160,128,128,96,0,0,224,0,0,96,128,0,224,128,0,96,0,128,224,0,128,96,128,128,224,128,128,32,64,0,160,64,0,32,192,0,160,192,0,32,64,128,160,64,128,32,192,128,160,192,128,96,64,0,224,64,0,96,192,0,224,192,0,96,64,128,224,64,128,96,192,128,224,192,128,32,0,64,160,0,64,32,128,64,160,128,64,32,0,192,160,0,192,32,128,192,160,128,192,96,0,64,224,0,64,96,128,64,224,128,64,96,0,192,224,0,192,96,128,192,224,128,192,32,64,64,160,64,64,32,192,64,160,192,64,32,64,192,160,64,192,32,192,192,160,192,192,96,64,64,224,64,64,96,192,64,224,192,64,96,64,192,224,64,192,96,192,192,224,192,192,0,32,0,128,32,0,0,160,0,128,160,0,0,32,128,128,32,128,0,160,128,128,160,128,64,32,0,192,32,0,64,160,0,192,160,0,64,32,128,192,32,128,64,160,128,192,160,128,0,96,0,128,96,0,0,224,0,128,224,0,0,96,128,128,96,128,0,224,128,128,224,128,64,96,0,192,96,0,64,224,0,192,224,0,64,96,128,192,96,128,64,224,128,192,224,128,0,32,64,128,32,64,0,160,64,128,160,64,0,32,192,128,32,192,0,160,192,128,160,192,64,32,64,192,32,64,64,160,64,192,160,64,64,32,192,192,32,192,64,160,192,192,160,192,0,96,64,128,96,64,0,224,64,128,224,64,0,96,192,128,96,192,0,224,192,128,224,192,64,96,64,192,96,64,64,224,64,192,224,64,64,96,192,192,96,192,64,224,192,192,224,192,32,32,0,160,32,0,32,160,0,160,160,0,32,32,128,160,32,128,32,160,128,160,160,128,96,32,0,224,32,0,96,160,0,224,160,0,96,32,128,224,32,128,96,160,128,224,160,128,32,96,0,160,96,0,32,224,0,160,224,0,32,96,128,160,96,128,32,224,128,160,224,128,96,96,0,224,96,0,96,224,0,224,224,0,96,96,128,224,96,128,96,224,128,224,224,128,32,32,64,160,32,64,32,160,64,160,160,64,32,32,192,160,32,192,32,160,192,160,160,192,96,32,64,224,32,64,96,160,64,224,160,64,96,32,192,224,32,192,96,160,192,224,160,192,32,96,64,160,96,64,32,224,64,160,224,64,32,96,192,160,96,192,32,224,192,160,224,192,96,96,64,224,96,64,96,224,64,224,224,64,96,96,192,224,96,192,96,224,192,0,0,0]
@@ -53,7 +53,9 @@ def main():
     print(model)
     # criterion and optimizer
     criterion = nn.CrossEntropyLoss()
-    optimizer = get_optimizer(args, model, False)
+    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
+                                momentum=args.momentum,
+                                weight_decay=args.weight_decay)
     if args.cuda:
         model.cuda()
         # Please use CUDA_VISIBLE_DEVICES to control the number of gpus
...
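``get_optimizer`` is gone from ``encoding.utils`` (the experiment script above now builds its SGD optimizer inline), while download helpers move in. A hedged sketch of the new helpers; the URL is a placeholder, not a real release asset::

    from encoding.utils import download, mkdir, check_sha1

    mkdir('./pretrained')   # no error if the directory already exists
    fname = download('https://example.com/resnet50.pth', path='./pretrained',
                     overwrite=False, sha1_hash=None)
    # pass the expected hex digest as sha1_hash to verify, or check separately:
    # ok = check_sha1(fname, '<expected-sha1-hexdigest>')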
@@ -33,7 +33,7 @@ class install(setuptools.command.install.install):
             f.write('"""This is encoding version file."""\n')
             f.write("__version__ = '{}'\n".format(version))
-version = '0.3.0'
+version = '0.4.0'
 try:
     sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'],
                                   cwd=cwd).decode('ascii').strip()
@@ -41,22 +41,36 @@ try:
 except Exception:
     pass
+try:
+    import pypandoc
+    readme = pypandoc.convert('README.md', 'rst')
+except(IOError, ImportError):
+    readme = open('README.md').read()
+requirements = [
+    'numpy',
+    'tqdm',
+    'nose',
+    'torch>=0.3.1',
+    'cffi>=1.0.0',
+]
 setup(
     name="encoding",
     version=version,
-    description="PyTorch Encoding",
-    url="https://github.com/zhanghang1989/PyTorch-Encoding",
     author="Hang Zhang",
-    author_email="zhang.hang@rutgers.edu",
-    # Require cffi.
-    install_requires=["cffi>=1.0.0"],
-    setup_requires=["cffi>=1.0.0"],
-    # Exclude the build files.
-    packages=find_packages(exclude=["build"]),
-    # Package where to put the extensions. Has to be a prefix of build.py.
-    package_data={'encoding': [
+    author_email="zhanghang0704@gmail.com",
+    url="https://github.com/zhanghang1989/PyTorch-Encoding",
+    description="PyTorch Encoding Package",
+    long_description=readme,
+    license='MIT',
+    install_requires=requirements,
+    packages=find_packages(exclude=["tests", "experiments"]),
+    package_data={ 'encoding': [
         'lib/*.so*', 'lib/*.dylib*',
+        '_ext/encoding_lib/*.so', '_ext/encoding_lib/*.dylib',
         'kernel/*.h', 'kernel/generic/*h',
+        'src/*.h',
     ]},
     ext_package="",
     # Extensions to compile.
...
@@ -8,16 +8,21 @@
 ## LICENSE file in the root directory of this source tree
 ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-import encoding
-import unittest
+import numpy as np
 import torch
+import torch.nn.functional as F
 from torch.autograd import Variable, gradcheck
-import torchvision.models as models
+import encoding
 EPS = 1e-6
+ATOL = 1e-4
+def _assert_tensor_close(a, b, atol=ATOL, rtol=EPS):
+    npa, npb = a.cpu().numpy(), b.cpu().numpy()
+    assert np.allclose(npa, npb, atol=atol), \
+        'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(
+            a, b, np.abs(npa - npb).max(), np.abs((npa - npb) / np.fmax(npa, 1e-5)).max())
 def test_aggregate():
     B,N,K,D = 2,3,4,5
@@ -28,7 +33,7 @@ def test_aggregate():
     C = Variable(torch.cuda.DoubleTensor(K,D).uniform_(-0.5,0.5),
                  requires_grad=True)
     input = (A, X, C)
-    test = gradcheck(encoding.functions.aggregate, input, eps=1e-6, atol=1e-4)
+    test = gradcheck(encoding.functions.aggregate, input, eps=EPS, atol=ATOL)
     print('Testing aggregate(): {}'.format(test))
@@ -41,7 +46,7 @@ def test_scaledL2():
     S = Variable(torch.cuda.DoubleTensor(K).uniform_(-0.5,0.5),
                  requires_grad=True)
     input = (X, C, S)
-    test = gradcheck(encoding.functions.scaledL2, input, eps=1e-6, atol=1e-4)
+    test = gradcheck(encoding.functions.scaledL2, input, eps=EPS, atol=ATOL)
     print('Testing scaledL2(): {}'.format(test))
@@ -51,16 +56,16 @@ def test_encoding():
                  requires_grad=True)
     input = (X,)
     layer = encoding.nn.Encoding(C,K).double().cuda()
-    test = gradcheck(layer, input, eps=1e-6, atol=1e-4)
+    test = gradcheck(layer, input, eps=EPS, atol=ATOL)
     print('Testing encoding(): {}'.format(test))
 def test_sum_square():
-    B,C,H,W = 2,3,4,5
-    X = Variable(torch.cuda.DoubleTensor(B,C,H,W).uniform_(-0.5,0.5),
+    B,C,H = 2,3,4
+    X = Variable(torch.cuda.DoubleTensor(B,C,H).uniform_(-0.5,0.5),
                  requires_grad=True)
     input = (X,)
-    test = gradcheck(encoding.functions.sum_square, input, eps=1e-6, atol=1e-4)
+    test = gradcheck(encoding.functions.sum_square, input, eps=EPS, atol=ATOL)
     print('Testing sum_square(): {}'.format(test))
@@ -71,6 +76,97 @@ def test_all_reduce():
         x.requires_grad = True
     Y = encoding.parallel.allreduce(1, *X)
     assert (len(X) == len(Y))
+    for i in range(1, ngpu):
+        _assert_tensor_close(Y[i].data, Y[0].data)
+    input = (1, *X)
+    #test = gradcheck(encoding.parallel.allreduce, input, eps=EPS, atol=ATOL)
+    test = gradcheck(encoding.parallel.allreduce, input, eps=EPS, atol=ATOL)
+def _test_syncbn(train_mode=True):
+    # generate input
+    B,C,H,W = 8,3,4,5
+    X = Variable(torch.cuda.DoubleTensor(B,C,H,W).uniform_(-0.5,0.5),
+                 requires_grad=True)
+    input = (X,)
+    # SyncBN using DataParallel
+    layer = encoding.nn.SyncBatchNorm2d(C)
+    model = torch.nn.DataParallel(layer).double().cuda()
+    layer.train(train_mode)
+    # grad check
+    test = gradcheck(model, input, eps=EPS, atol=ATOL)
+    print('Testing SyncBatchNorm2d(): {}'.format(test))
+def _test_syncbn_func(train_mode=True):
+    # generate input
+    B, C, H = 2, 3, 4
+    X = Variable(torch.cuda.DoubleTensor(B,C,H).uniform_(-0.5, 0.5),
+                 requires_grad=True)
+    xsum = Variable(torch.ones(C).double().cuda(), requires_grad=True)
+    xsqsum = Variable(torch.ones(C).double().cuda(), requires_grad=True)
+    gamma = Variable(torch.ones(C).double().cuda(), requires_grad=True)
+    beta = Variable(torch.zeros(C).double().cuda(), requires_grad=True)
+    gamma.requires_grad=True
+    beta.requires_grad=True
+    runningVar = torch.ones(C).double().cuda()
+    runningMean = torch.zeros(C).double().cuda()
+    N = B * H
+    inputs = (X, xsum, xsqsum, gamma, beta, runningMean, runningVar, N, 0.1, 1e-5, train_mode)
+    # grad check
+    test = gradcheck(encoding.functions.batchnorm.apply, inputs, eps=EPS, atol=ATOL)
+    print('Testing batchnorm(): {}'.format(test))
+def _checkBatchNormResult(bn1, bn2, input, is_train, cuda=False):
+    def _find_bn(module):
+        for m in module.modules():
+            if isinstance(m, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
+                              encoding.nn.SyncBatchNorm1d, encoding.nn.SyncBatchNorm2d)):
+                return m
+    def _syncParameters(bn1, bn2):
+        bn1.reset_parameters()
+        bn2.reset_parameters()
+        if bn1.affine and bn2.affine:
+            bn2.weight.data.copy_(bn1.weight.data)
+            bn2.bias.data.copy_(bn1.bias.data)
+            bn2.running_mean.copy_(bn1.running_mean)
+            bn2.running_var.copy_(bn1.running_var)
+    bn1.train(mode=is_train)
+    bn2.train(mode=is_train)
+    if cuda:
+        input = input.cuda()
+    # using the same values for gamma and beta
+    _syncParameters(_find_bn(bn1), _find_bn(bn2))
+    input1 = Variable(input.clone(), requires_grad=True)
+    output1 = bn1(input1)
+    input2 = Variable(input.clone(), requires_grad=True)
+    output2 = bn2(input2)
+    _assert_tensor_close(input1.data, input2.data)
+    _assert_tensor_close(output1.data, output2.data)
+    if not is_train:
+        return
+    (output1 ** 2).sum().backward()
+    (output2 ** 2).sum().backward()
+    _assert_tensor_close(input1.grad.data, input2.grad.data)
+    _assert_tensor_close(_find_bn(bn1).running_mean, _find_bn(bn2).running_mean)
+    _assert_tensor_close(_find_bn(bn1).running_var, _find_bn(bn2).running_var)
+def testSyncBN():
+    bn = torch.nn.BatchNorm2d(10).cuda().double()
+    sync_bn = encoding.nn.SyncBatchNorm2d(10).double()
+    sync_bn = torch.nn.DataParallel(sync_bn).cuda()
+    # check with unsync version
+    for i in range(10):
+        _checkBatchNormResult(bn, sync_bn, torch.rand(16, 10, 16, 16).double(), True, cuda=True)
+        _checkBatchNormResult(bn, sync_bn, torch.rand(16, 10, 16, 16).double(), False, cuda=True)
+    # gradcheck
+    _test_syncbn_func(True)
+    _test_syncbn(True)
 if __name__ == '__main__':
...