Commit 8f8fbb9f authored by Hang Zhang's avatar Hang Zhang

v1.0.1

parent aa9af7fd
@@ -19,12 +19,69 @@ from torch.nn.parallel.scatter_gather import scatter, scatter_kwargs, \
from torch.nn.parallel.replicate import replicate
from torch.nn.parallel.parallel_apply import parallel_apply
def nccl_all_reduce(inputs):
# TODO, figure out why nccl all_reduce doesn't work for gradcheck
input_size = inputs[0].size()
#if nccl.is_available(inputs):
for i, inp in enumerate(inputs):
assert inp.is_cuda, \
"reduce_add expects all inputs to be on GPUs"
if inp.size() != input_size:
got = 'x'.join(str(x) for x in inp.size())
expected = 'x'.join(str(x) for x in input_size)
            raise ValueError("input {} has invalid size: got {}, "
                             "but expected {}".format(i, got, expected))
nccl.all_reduce(inputs)
return inputs
def comm_all_reduce(inputs):
    # comm backend: reduce-add onto one device, then copy the summed result
    # back to every GPU (assumes inputs are ordered by device index 0..N-1)
    result = comm.reduce_add(inputs)
    results = []
    for i in range(len(inputs)):
        results.append(result.clone().cuda(i))
    return results
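# Editorial usage sketch (not part of this commit): both helpers take one tensor
# per GPU and return the element-wise sum on every device; nccl_all_reduce reduces
# in place via NCCL, comm_all_reduce reduces on one device and copies the result
# back out. The two-GPU setup and tensor shape below are assumptions.
#
#     >>> xs = [torch.randn(4, 4).cuda(i) for i in range(2)]
#     >>> sums = comm_all_reduce(xs)                 # sums[i] lives on GPU i
#     >>> nccl_all_reduce([x.clone() for x in xs])   # same result, reduced in place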
class AllReduce(Function):
"""Cross GPU all reduce autograd operation for calculate mean and
variance in SyncBN.
"""
def forward(ctx, *inputs):
outputs = comm_all_reduce(list(inputs))
return tuple(outputs)
def backward(ctx, *gradOutputs):
gradInputs = comm_all_reduce(list(gradOutputs))
return tuple(gradInputs)
class Broadcast(Function):
"""Multi-GPU broadcast autograd function
"""
def __init__(self, target_gpus):
super(Broadcast, self).__init__()
self.target_gpus = target_gpus
def forward(self, *inputs):
if not all(input.is_cuda for input in inputs):
raise TypeError('Broadcast function not implemented for CPU tensors')
if len(inputs) == 0:
return tuple()
self.num_inputs = len(inputs)
self.input_device = inputs[0].get_device()
outputs = comm.broadcast_coalesced(inputs, self.target_gpus)
return tuple([t for tensors in outputs for t in tensors])
def backward(self, *grad_outputs):
grad_outputs = [grad_outputs[i:i + self.num_inputs]
for i in range(0, len(grad_outputs), self.num_inputs)]
return comm.reduce_add_coalesced(grad_outputs, self.input_device)
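# Editorial usage sketch (not part of this commit): AllReduce and Broadcast are
# old-style autograd Functions. AllReduce sums per-GPU statistics (e.g. the SyncBN
# mean/variance) and hands every GPU the total; Broadcast copies tensors from one
# device to the listed target GPUs and reduce-adds the gradients back. The inputs
# below are assumptions for illustration (Variable from torch.autograd).
#
#     >>> x0, x1 = Variable(torch.ones(3).cuda(0)), Variable(torch.ones(3).cuda(1))
#     >>> s0, s1 = AllReduce()(x0, x1)       # each s_i holds the sum, on GPU i
#     >>> y0, y1 = Broadcast([0, 1])(x0)     # copies of x0 on GPUs 0 and 1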
class ModelDataParallel(Module):
    """Implements data parallelism at the module level.
-   .. ModelDataParallel_
    This container parallelizes the application of the given module by
    splitting the input across the specified devices by chunking in the
    batch dimension.
@@ -32,7 +89,7 @@ class ModelDataParallel(Module):
    and each replica handles a portion of the input. During the backwards
    pass, gradients from each replica are summed into the original module.
    Note that the outputs are not gathered, please use compatible
-   CriterionDataParallel_ .
+   :class:`encoding.parallel.CriterionDataParallel`.
    The batch size should be larger than the number of GPUs used. It should
    also be an integer multiple of the number of GPUs so that each chunk is
@@ -44,19 +101,29 @@ class ModelDataParallel(Module):
    Example::
-       >>> net = torch.nn.ModelDataParallel(model, device_ids=[0, 1, 2])
+       >>> net = encoding.nn.ModelDataParallel(model, device_ids=[0, 1, 2])
        >>> output = net(input_var)
    """
-   def __init__(self, module, device_ids=None, dim=0):
+   def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(ModelDataParallel, self).__init__()
        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
if output_device is None:
output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
self.output_device = output_device
        self.master_mean, self.master_var = {}, {}
        if len(self.device_ids) == 1:
            self.module.cuda(device_ids[0])
"""
        # TODO FIXME temporary solution for BN
for m in self.module.modules():
classname = m.__class__.__name__
if classname.find('BatchNorm2d') != -1:
m.momentum = 0.9996
"""
    def forward(self, *inputs, **kwargs):
        inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
@@ -79,13 +146,11 @@ class ModelDataParallel(Module):
class CriterionDataParallel(Module):
    """
-   .. CriterionDataParallel_
    Calculate loss in multiple GPUs, which balances the memory usage for
    Semantic Segmentation.
    The targets are split across the specified devices by chunking in
-   the batch dimension. Please use together with ModelDataParallel_
+   the batch dimension. Please use together with :class:`encoding.parallel.ModelDataParallel`.
    """
    def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(CriterionDataParallel, self).__init__()
@@ -123,3 +188,158 @@ class CriterionDataParallel(Module):
        return gather(outputs, output_device, dim=self.dim).mean()
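# Editorial usage sketch (not part of this commit): the two containers are meant
# to be used together, since ModelDataParallel does not gather its outputs while
# CriterionDataParallel gathers and averages the per-GPU losses. The model and
# criterion names below are assumptions.
#
#     >>> net = encoding.nn.ModelDataParallel(model, device_ids=[0, 1, 2])
#     >>> criterion = CriterionDataParallel(nn.CrossEntropyLoss(), device_ids=[0, 1, 2])
#     >>> outputs = net(input_var)               # one output per GPU, not gathered
#     >>> loss = criterion(outputs, target_var)  # per-GPU losses gathered and averaged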
class SelfDataParallel(Module):
    """SelfDataParallel, please make sure you understand it before using.
    Each module in the network should be in self-parallel mode,
    which allows a list of inputs from multiple GPUs.
    Please see encoding.nn for details; use with caution.
    """
def __init__(self, module, device_ids=None, output_device=None, dim=0):
super(SelfDataParallel, self).__init__()
if device_ids is None:
device_ids = list(range(torch.cuda.device_count()))
if output_device is None:
output_device = device_ids[0]
self.dim = dim
self.module = module
self.device_ids = device_ids
self.output_device = output_device
self.master_mean, self.master_var = {}, {}
if len(self.device_ids) == 1:
self.module.cuda(device_ids[0])
def forward(self, *inputs, **kwargs):
inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
outputs = self.module(inputs)
return outputs
def scatter(self, inputs, kwargs, device_ids):
#return my_scatter(inputs, target_gpus=device_ids)
outputs = scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
return outputs
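# Editorial usage sketch (not part of this commit): SelfDataParallel only scatters
# the inputs across device_ids and then calls the wrapped module once with the
# resulting list, so the module itself must be built from "self-parallel" layers
# (see encoding.nn) that accept a list of per-GPU tensors. Names are assumptions.
#
#     >>> net = SelfDataParallel(model, device_ids=[0, 1])
#     >>> outputs = net(input_var)   # per-GPU outputs from the self-parallel module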
def criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None):
assert len(modules) == len(inputs)
assert len(targets) == len(inputs)
if kwargs_tup:
assert len(modules) == len(kwargs_tup)
else:
kwargs_tup = ({},) * len(modules)
# Fast track
if len(modules) == 1:
return (modules[0](*inputs[0], *targets[0], **kwargs_tup[0]), )
lock = threading.Lock()
results = {}
def _worker(i, module, input, target, kwargs, results, lock):
var_input = input
while not isinstance(var_input, Variable):
var_input = var_input[0]
var_target = target
while not isinstance(var_target, Variable):
var_target = var_target[0]
try:
with torch.cuda.device_of(var_input):
output = module(input, *target, **kwargs)
with lock:
results[i] = output
except Exception as e:
with lock:
results[i] = e
threads = [threading.Thread(target=_worker,
args=(i, module, input, target,
kwargs, results, lock),
)
for i, (module, input, target, kwargs) in
enumerate(zip(modules, inputs, targets, kwargs_tup))]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
outputs = []
for i in range(len(inputs)):
output = results[i]
if isinstance(output, Exception):
raise output
outputs.append(output)
return outputs
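# Editorial usage sketch (not part of this commit): criterion_parallel_apply mirrors
# torch's parallel_apply, but feeds each replica its (input, *target) pair and runs
# one thread per replica, so a data-parallel criterion can be evaluated as
#
#     >>> replicas = replicate(criterion, device_ids[:len(outputs)])
#     >>> losses = criterion_parallel_apply(replicas, outputs, targets)
#
# where outputs and targets are already scattered, one chunk per GPU (assumed names).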
def get_a_var(obj):
if isinstance(obj, Variable):
return obj
if isinstance(obj, list) or isinstance(obj, tuple):
results = map(get_a_var, obj)
for result in results:
if isinstance(result, Variable):
return result
if isinstance(obj, dict):
results = map(get_a_var, obj.items())
for result in results:
if isinstance(result, Variable):
return result
return None
def my_parallel_apply(modules, inputs, kwargs_tup=None):
assert len(modules) == len(inputs)
if kwargs_tup:
assert len(modules) == len(kwargs_tup)
else:
kwargs_tup = ({},) * len(modules)
# Fast track
if len(modules) == 1:
return (modules[0](*inputs[0], **kwargs_tup[0]), )
lock = threading.Lock()
results = {}
def _worker(i, module, input, kwargs, results, lock):
var_input = get_a_var(input)
try:
with torch.cuda.device_of(var_input):
output = module(input, **kwargs)
with lock:
results[i] = output
except Exception as e:
with lock:
results[i] = e
threads = [threading.Thread(target=_worker,
args=(i, module, input, kwargs, results, lock),
)
for i, (module, input, kwargs) in
enumerate(zip(modules, inputs, kwargs_tup))]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
outputs = []
for i in range(len(inputs)):
output = results[i]
if isinstance(output, Exception):
raise output
outputs.append(output)
return outputs
def my_data_parallel(module, inputs, device_ids=None, \
dim=0, module_kwargs=None):
if device_ids is None:
device_ids = list(range(torch.cuda.device_count()))
if len(inputs) == 1:
return module(inputs[0])
#print('my data parallel, len(inputs)', len(inputs))
replicas = replicate(module, device_ids[:len(inputs)])
outputs = my_parallel_apply(replicas, inputs, module_kwargs)
return outputs
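# Editorial usage sketch (not part of this commit): my_data_parallel expects inputs
# that are already split, one chunk per GPU; it replicates the module onto the first
# len(inputs) devices and applies each replica to its own chunk without gathering.
# The scatter call below is an assumption about how the chunks are produced.
#
#     >>> chunks = scatter(input_var, device_ids, dim=0)
#     >>> outs = my_data_parallel(model, chunks, device_ids)   # one output per GPU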
@@ -17,12 +17,26 @@ extern THCState *state;
extern "C" {
#endif
// float
#include "generic/encoding_generic.c" #include "generic/encoding_generic.c"
#include "THC/THCGenerateFloatType.h" #include "THC/THCGenerateFloatType.h"
#include "generic/syncbn_generic.c"
#include "THC/THCGenerateFloatType.h"
#include "generic/pooling_generic.c"
#include "THC/THCGenerateFloatType.h"
// double
#include "generic/encoding_generic.c" #include "generic/encoding_generic.c"
#include "THC/THCGenerateDoubleType.h" #include "THC/THCGenerateDoubleType.h"
#include "generic/syncbn_generic.c"
#include "THC/THCGenerateDoubleType.h"
#include "generic/pooling_generic.c"
#include "THC/THCGenerateDoubleType.h"
#ifdef __cplusplus
}
#endif
@@ -64,10 +64,22 @@ int Encoding_Float_batchnorm_Backward(THCudaTensor *gradoutput_,
int Encoding_Float_sum_square_Forward(THCudaTensor *input_,
    THCudaTensor *sum_, THCudaTensor *square_);
-void Encoding_Float_sum_square_Backward(
+int Encoding_Float_sum_square_Backward(
    THCudaTensor *gradInput, THCudaTensor *input_,
    THCudaTensor *gradSum_, THCudaTensor *gradSquare_);
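/* Editorial note (not part of this commit): the DilatedAvgPool2d declarations below
 * appear to follow the usual THNN pooling convention: kH/kW kernel size, dH/dW
 * stride, padH/padW zero-padding, dilationH/dilationW dilation, with X_/Y_
 * (gradX_/gradY_) the input/output (gradient) tensors. */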
int Encoding_Float_DilatedAvgPool2d_Forward(
THCudaTensor *X_, THCudaTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
int Encoding_Float_DilatedAvgPool2d_Backward(
THCudaTensor *gradX_, THCudaTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int Encoding_Double_scaledl2_forward(THCudaDoubleTensor *SL,
@@ -124,3 +136,15 @@ int Encoding_Double_sum_square_Forward(THCudaDoubleTensor *input_,
void Encoding_Double_sum_square_Backward(
    THCudaDoubleTensor *gradInput, THCudaDoubleTensor *input_,
    THCudaDoubleTensor *gradSum_, THCudaDoubleTensor *gradSquare_);
int Encoding_Double_DilatedAvgPool2d_Forward(
THCudaDoubleTensor *X_, THCudaDoubleTensor *Y_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
int Encoding_Double_DilatedAvgPool2d_Backward(
THCudaDoubleTensor *gradX_, THCudaDoubleTensor *gradY_,
int kH, int kW, int dH, int dW,
int padH, int padW,
int dilationH, int dilationW);
# fetch the pre-trained minc.pth.tar checkpoint into the models/ directory
cd models
wget -O minc.pth.tar "https://www.dropbox.com/s/0q57t0nd1tka2qx/minc.pth.tar?dl=1"
cd ..