Commit aa9af7fd authored by Hang Zhang

docs

parent a4d71e75
docs/source/_static/img/icon.png (image replaced: 12.7 KB → 101 KB)
......@@ -23,6 +23,7 @@ PyTorch-Encoding is an optimized PyTorch package using GPU, including Encoding L
encoding
syncbn
parallel
Indices and tables
......
.. role:: hidden
   :class: hidden-section
Data Parallel
=============
The current PyTorch DataParallel does not support multi-GPU loss calculation, which makes the GPU memory usage very inefficient. We address this issue here with CriterionDataParallel, which computes the loss in parallel on each GPU (see the usage sketch below).
The DataParallel compatible with SyncBN will be released later.
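A minimal usage sketch (``model``, ``base_criterion``, ``inputs`` and ``target`` below are placeholders for the user's own objects, not names provided by this package)::

    >>> net = encoding.ModelDataParallel(model, device_ids=[0, 1, 2])
    >>> criterion = encoding.CriterionDataParallel(base_criterion, device_ids=[0, 1, 2])
    >>> outputs = net(inputs)              # per-GPU outputs, not gathered
    >>> loss = criterion(outputs, target)  # per-GPU losses, gathered and averaged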
Modules
-------
.. currentmodule:: encoding
:hidden:`ModelDataParallel`
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: ModelDataParallel
   :members:
:hidden:`CriterionDataParallel`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: CriterionDataParallel
   :members:
......@@ -6,6 +6,7 @@ Synchronized BatchNorm
Currently BN is implemented unsynchronized across the GPUs, which is a big problem for memory-consuming tasks such as Semantic Segmentation, since the mini-batch on each GPU is very small.
Synchronizing the batch normalization across multiple GPUs is not easy to implement within the current DataParallel framework. We address this difficulty by making each layer 'self-parallel', that is, accepting inputs from multiple GPUs. Therefore, we can handle each such layer separately when synchronizing it across GPUs (see the sketch below).
We will release the whole SyncBN Module and compatible DataParallel later.
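The 'self-parallel' idea can be illustrated with a toy sketch in plain PyTorch. This is only an illustration of a layer that accepts inputs from multiple GPUs and shares the statistics; it is not the SyncBN implementation that will be released::

    import torch

    def toy_sync_norm(xs, eps=1e-5):
        # xs is a list of per-GPU activations, each of shape N x C x H x W
        device = xs[0].get_device()
        # per-device partial sums over everything except the channel dimension
        flat = [x.transpose(0, 1).contiguous().view(x.size(1), -1) for x in xs]
        count = sum(f.size(1) for f in flat)
        xsum = sum(f.sum(1).cuda(device) for f in flat)
        xsqsum = sum(f.pow(2).sum(1).cuda(device) for f in flat)
        # global mean/variance computed once from the reduced statistics
        mean = xsum / count
        var = xsqsum / count - mean * mean
        # normalize every chunk with the *shared* statistics
        outs = []
        for x in xs:
            m = mean.cuda(x.get_device()).view(1, -1, 1, 1)
            v = var.cuda(x.get_device()).view(1, -1, 1, 1)
            outs.append((x - m) / (v + eps).sqrt())
        return outs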
.. currentmodule:: encoding
......
......@@ -10,5 +10,6 @@
from .functions import *
from .modules import *
from .syncbn import *
from .syncbn import sum_square, batchnormtrain, batchnormeval
from .parallel import ModelDataParallel, CriterionDataParallel
......@@ -243,7 +243,7 @@ def assign(R, S):
Calculate assignment weights for given residuals (:math:`R`) and scale (:math:`S`)
.. math::
a_{ik} = \frac{exp(-s_k\|x_{i}-c_k\|^2)}{\sum_{j=1}^K exp(-s_j\|x_{i}-c_j\|^2)}
a_{ik} = \frac{exp(-s_k\|r_{ik}\|^2)}{\sum_{j=1}^K exp(-s_j\|r_{ij}\|^2)}
Shape:
- Input: :math:`R\in\mathcal{R}^{B\times N\times K\times D}`, :math:`S\in \mathcal{R}^K` (where :math:`B` is batch, :math:`N` is total number of features, :math:`K` is number of codewords, :math:`D` is feature dimensions.)
......
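For reference, the assignment weights in the formula above can be written in a few lines of plain PyTorch. This sketch is only meant to make the math concrete; it is not the implementation used by the package::

    import torch.nn.functional as F

    def assign_weights(R, S):
        # R: B x N x K x D residuals, S: K learnable scale factors
        sq_norm = R.pow(2).sum(3)  # \|r_{ik}\|^2, shape B x N x K
        # softmax over the K codewords of the scaled negative energies
        return F.softmax(-S.view(1, 1, -1) * sq_norm, dim=2)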
......@@ -19,7 +19,8 @@ from ..functions import *
class Encoding(nn.Module):
r"""
Encoding Layer: learnable residual encoders over 3d or 4d input that is seen as a mini-batch.
Encoding Layer: learnable residual encoders over 3d or 4d input that
is seen as a mini-batch.
.. math::
......@@ -92,7 +93,8 @@ class Encoding(nn.Module):
class Aggregate(nn.Module):
r"""
Aggregate operation, aggregate the residuals (:math:`R`) with assignment weights (:math:`A`).
Aggregate operation, aggregate the residuals (:math:`R`) with
assignment weights (:math:`A`).
.. math::
e_{k} = \sum_{i=1}^{N} a_{ik} r_{ik}
......
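Similarly, the aggregation above amounts to a weighted sum over the :math:`N` features. Again this is only an illustrative sketch, not the package's own implementation::

    def aggregate_sketch(A, R):
        # A: B x N x K assignment weights, R: B x N x K x D residuals
        # e_k = sum_i a_{ik} * r_{ik}  ->  output E: B x K x D
        return (A.unsqueeze(3) * R).sum(1)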
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import threading
import torch
import torch.cuda.nccl as nccl
import torch.cuda.comm as comm
from torch.autograd import Variable, Function
from torch.nn.modules import Module
from torch.nn.parallel.scatter_gather import scatter, scatter_kwargs, gather
from torch.nn.parallel.replicate import replicate
from torch.nn.parallel.parallel_apply import parallel_apply
class ModelDataParallel(Module):
    """Implements data parallelism at the module level.

    .. _ModelDataParallel:

    This container parallelizes the application of the given module by
    splitting the input across the specified devices, chunking along the
    batch dimension.
    In the forward pass, the module is replicated on each device,
    and each replica handles a portion of the input. During the backward
    pass, gradients from each replica are summed into the original module.
    Note that the outputs are not gathered; please use the compatible
    CriterionDataParallel_.

    The batch size should be larger than the number of GPUs used. It should
    also be an integer multiple of the number of GPUs so that each chunk is
    the same size (so that each GPU processes the same number of samples).

    Args:
        module: module to be parallelized
        device_ids: CUDA devices (default: all devices)

    Example::

        >>> net = encoding.ModelDataParallel(model, device_ids=[0, 1, 2])
        >>> output = net(input_var)
    """
    def __init__(self, module, device_ids=None, dim=0):
        super(ModelDataParallel, self).__init__()
        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.master_mean, self.master_var = {}, {}
        if len(self.device_ids) == 1:
            self.module.cuda(device_ids[0])

    def forward(self, *inputs, **kwargs):
        # scatter the inputs across the devices and run one replica per chunk
        inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
        if len(self.device_ids) == 1:
            return self.module(*inputs[0], **kwargs[0])
        replicas = self.replicate(self.module,
                                  self.device_ids[:len(inputs)])
        outputs = self.parallel_apply(replicas, inputs, kwargs)
        # the per-GPU outputs are returned as-is (no gather)
        return outputs

    def replicate(self, module, device_ids):
        return replicate(module, device_ids)

    def scatter(self, inputs, kwargs, device_ids):
        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)

    def parallel_apply(self, replicas, inputs, kwargs):
        return parallel_apply(replicas, inputs, kwargs)
class CriterionDataParallel(Module):
    """
    .. _CriterionDataParallel:

    Calculates the loss on multiple GPUs, which balances the memory usage for
    Semantic Segmentation.

    The targets are split across the specified devices by chunking along
    the batch dimension. Please use it together with ModelDataParallel_.
    """
    def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(CriterionDataParallel, self).__init__()
        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device
        if len(self.device_ids) == 1:
            self.module.cuda(device_ids[0])

    def forward(self, inputs, *targets, **kwargs):
        # the inputs are the per-GPU outputs of ModelDataParallel and are
        # already scattered, so only the targets need to be scattered here
        targets, kwargs = self.scatter(targets, kwargs, self.device_ids)
        if len(self.device_ids) == 1:
            return self.module(inputs, *targets[0], **kwargs[0])
        replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
        outputs = self.parallel_apply(replicas, inputs, targets, kwargs)
        return self.gather(outputs, self.output_device)

    def replicate(self, module, device_ids):
        return replicate(module, device_ids)

    def scatter(self, inputs, kwargs, device_ids):
        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)

    def parallel_apply(self, replicas, inputs, targets, kwargs):
        # applies each replica to its own inputs and targets in parallel
        # (criterion_parallel_apply is a helper not shown in this diff)
        return criterion_parallel_apply(replicas, inputs, targets, kwargs)

    def gather(self, outputs, output_device):
        # gather the per-GPU losses onto output_device and average them
        return gather(outputs, output_device, dim=self.dim).mean()
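Putting the two containers together, a typical training step looks roughly like this. ``model``, ``base_criterion``, ``images`` and ``targets`` are placeholder names for the user's own objects; the sketch is an illustration, not part of the module above::

    import encoding
    import torch

    net = encoding.ModelDataParallel(model, device_ids=[0, 1, 2])
    criterion = encoding.CriterionDataParallel(base_criterion, device_ids=[0, 1, 2])
    optimizer = torch.optim.SGD(net.parameters(), lr=0.01)

    optimizer.zero_grad()
    outputs = net(images)               # list of per-GPU outputs (not gathered)
    loss = criterion(outputs, targets)  # per-GPU losses, gathered and averaged
    loss.backward()                     # gradients are summed into the original module
    optimizer.step()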