"torchvision/vscode:/vscode.git/clone" did not exist on "b6f55ed899fdd8a6b73c7a666d30e9afc8f5f314"
syncbn.py 7.66 KB
Newer Older
Hang Zhang's avatar
v1.0.1  
Hang Zhang committed
1
2
3
4
5
6
7
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

"""Synchronized Cross-GPU Batch Normalization Module"""
import functools
import collections
import threading
import torch
from torch.nn import Module, Sequential, Conv1d, Conv2d, ConvTranspose2d, \
    ReLU, Sigmoid, MaxPool2d, AvgPool2d, AdaptiveAvgPool2d, Dropout2d, Linear, \
    DataParallel
from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn.functional import batch_norm
from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast

from ..functions import *
from ..parallel import allreduce

__all__ = ['BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'Module', 'Sequential', 'Conv1d',
           'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d', 'AvgPool2d',
           'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']


class _SyncBatchNorm(_BatchNorm):
    def __init__(self, num_features, eps=1e-5, momentum=0.001, affine=True):
        super(_SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)

        self._is_parallel = False
        self._parallel_id = None
        self._slave_pipe = None
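        # the SharedTensor is shared by reference across the DataParallel
        # replicas, so the per-GPU threads exchange statistics through it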
        self.sharedT = SharedTensor(torch.cuda.device_count())

    def forward(self, input):
        # Resize the input to (B, C, -1).
        input_shape = input.size()
        input = input.view(input_shape[0], self.num_features, -1)
        if not self.training:
            std = (self.running_var.clamp(self.eps)).sqrt()
            output = batchnormeval(input, self.weight, self.bias, self.running_mean, std)
            return output.view(input_shape)

        # sum(x) and sum(x^2)
        N = input.size(0) * input.size(2)
        xsum, xsqsum = sum_square(input)

        # all-reduce for global sum(x) and sum(x^2)
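        # push waits until every device has contributed its partial sums;
        # pull returns this device's view of the globally reduced results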
        igpu = input.get_device()
        self.sharedT.push(N, igpu, xsum, xsqsum)
        N, xsum, xsqsum = self.sharedT.pull(igpu)

        # calculate mean, var
        mean = xsum / N
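        # sum of squared deviations: sum((x - mean)^2) = sum(x^2) - (sum(x))^2 / N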
        sumvar = xsqsum - xsum * xsum / N
        unbias_var = sumvar / (N - 1)
        bias_var = sumvar / N
        std = bias_var.clamp(self.eps).sqrt()

        # update running_mean and var
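        # running_var tracks the unbiased variance, as in torch.nn.BatchNorm2d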
        self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * mean.data
        self.running_var = (1-self.momentum) * self.running_var + self.momentum * unbias_var.data

        # forward
        return batchnormtrain(input, self.weight, self.bias, mean, std).view(input_shape)


class BatchNorm1d(_SyncBatchNorm):
    r"""Please see the docs in :class:`encoding.nn.BatchNorm2d`"""
    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError('expected 2D or 3D input (got {}D input)'
                             .format(input.dim()))
        super(BatchNorm1d, self)._check_input_dim(input)


class BatchNorm2d(_SyncBatchNorm):
    r"""Cross-GPU Synchronized Batch normalization (SyncBN)

    The standard BN [1]_ implementation only normalizes the data within each device.
    SyncBN normalizes the input within the whole mini-batch.
    We follow the sync-once implementation described in the paper [2]_.
    Please see the design idea in the `notes <./notes/syncbn.html>`_.

    .. note::
        Please use ``CUDA_VISIBLE_DEVICES`` to select the number of GPUs.

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-channel over
    the mini-batches and gamma and beta are learnable parameter vectors
    of size C (where C is the input size).

    During training, this layer keeps a running estimate of its computed mean
    and variance. The running estimates are kept with a default momentum of 0.001.

    During evaluation, this running mean/variance is used for normalization.

    Because the BatchNorm is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial BatchNorm.

    Args:
        num_features: num_features from an expected input of
            size batch_size x num_features x height x width
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Default: 0.001
        affine: a boolean value that when set to ``True``, gives the layer learnable
            affine parameters. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Reference:
        .. [1] Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating deep network training by reducing internal covariate shift." *ICML 2015*
        .. [2] Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, and Amit Agrawal. "Context Encoding for Semantic Segmentation." *CVPR 2018*

    Examples:
        >>> # Use it exactly the same as the standard BatchNorm2d
        >>> m = BatchNorm2d(100)
        >>> net = torch.nn.DataParallel(m)
        >>> output = net(input)
    """
    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError('expected 4D input (got {}D input)'
                             .format(input.dim()))
        super(BatchNorm2d, self)._check_input_dim(input)


class BatchNorm3d(_SyncBatchNorm):
    r"""Please see the docs in :class:`encoding.nn.BatchNorm2d`"""
    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError('expected 5D input (got {}D input)'
                             .format(input.dim()))
        super(BatchNorm3d, self)._check_input_dim(input)


class SharedTensor(object):
    """Shared Tensor for cross GPU all reduce operation"""
    def __init__(self, nGPUs):
        self.mutex = threading.Lock()
        self.all_tasks_done = threading.Condition(self.mutex)
        self.nGPUs = nGPUs
        self._clear()

    def _clear(self):
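        # reset the accumulated state for the next synchronization round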
        self.N = 0
        self.dict = {}
        self.push_tasks = self.nGPUs
        self.reduce_tasks = self.nGPUs

    def push(self, *inputs):
        if self.nGPUs <= 1:
            return tuple(inputs)
        # push from device
        with self.mutex:
            if self.push_tasks == 0:
                self._clear()
            self.N += inputs[0]
            igpu = inputs[1]
            self.dict[igpu] = inputs[2:]
            self.push_tasks -= 1
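        # barrier: wait until all GPUs have pushed their partial sums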
        with self.all_tasks_done:
            if self.push_tasks == 0:
                self.all_tasks_done.notify_all()
            while self.push_tasks:
                self.all_tasks_done.wait()

    def pull(self, igpu):
        # pull from device
        with self.mutex:
            if igpu == 0:
                assert len(self.dict) == self.nGPUs
                # flatten the tensors
                self.list = [t for i in range(len(self.dict)) for t in self.dict[i]]
                self.outlist = allreduce(2, *self.list)
                self.reduce_tasks -= 1
            else:
                self.reduce_tasks -= 1
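        # barrier: wait until the GPU-0 thread has finished the all-reduce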
        with self.all_tasks_done:
            if self.reduce_tasks == 0:
                self.all_tasks_done.notify_all()
            while self.reduce_tasks:
                self.all_tasks_done.wait()
        # all reduce done
        return self.N, self.outlist[2*igpu], self.outlist[2*igpu+1]

    def __len__(self):
        return self.nGPUs

    def __repr__(self):
        return 'SharedTensor'
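

# ---------------------------------------------------------------------------
# A minimal CPU sketch of the sync-once statistics used above, for reference
# only; ``_syncbn_reference`` is a hypothetical helper, not part of the
# package API. Each "device" chunk contributes per-channel sum(x) and
# sum(x^2), the partial sums are reduced globally, and every chunk is then
# normalized with the same global mean and std (affine transform omitted).
# ---------------------------------------------------------------------------
def _syncbn_reference(chunks, eps=1e-5):
    """Normalize a list of (B, C, L) tensors with shared global statistics."""
    N = sum(c.size(0) * c.size(2) for c in chunks)
    xsum = sum(c.sum(0).sum(-1) for c in chunks)           # global sum(x), shape (C,)
    xsqsum = sum(c.pow(2).sum(0).sum(-1) for c in chunks)  # global sum(x^2), shape (C,)
    mean = xsum / N
    bias_var = xsqsum / N - mean * mean                    # E[x^2] - E[x]^2
    std = bias_var.clamp(min=eps).sqrt()
    return [(c - mean.view(1, -1, 1)) / std.view(1, -1, 1) for c in chunks]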