Commit 9661dbd7 authored by Michael Carilli

Updating distributed example

parent 789afd89
@@ -6,6 +6,7 @@ import torch.nn.functional as F
 import torch.optim as optim
 from torchvision import datasets, transforms
 from torch.autograd import Variable
+from apex.fp16_utils import to_python_float
 #=====START: ADDED FOR DISTRIBUTED======
 '''Add custom module for distributed'''
@@ -83,8 +84,10 @@ if args.distributed:
     torch.cuda.set_device(args.rank % torch.cuda.device_count())
     '''Initialize distributed communication'''
-    dist.init_process_group(args.dist_backend, init_method=args.dist_url,
-                            world_size=args.world_size)
+    dist.init_process_group(args.dist_backend,
+                            init_method=args.dist_url,
+                            world_size=args.world_size,
+                            rank=args.rank)
 #=====END: ADDED FOR DISTRIBUTED======
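The substantive fix in this hunk is passing `rank` explicitly to `init_process_group`; without it, each process would need the launcher to supply its rank through the environment. A minimal sketch of the per-process setup the example now performs (the `setup_distributed` wrapper and its default arguments are illustrative, not part of the example):

import torch
import torch.distributed as dist

def setup_distributed(rank, world_size,
                      backend='nccl',
                      init_method='tcp://127.0.0.1:23456'):
    # Pin this process to a single GPU before any CUDA work happens,
    # mirroring the example's rank % device_count() mapping.
    torch.cuda.set_device(rank % torch.cuda.device_count())
    # Every process must agree on world_size and supply a unique rank.
    dist.init_process_group(backend,
                            init_method=init_method,
                            world_size=world_size,
                            rank=rank)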
@@ -174,18 +177,19 @@ def train(epoch):
         if batch_idx % args.log_interval == 0:
             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                 epoch, batch_idx * len(data), len(train_loader.dataset),
-                100. * batch_idx / len(train_loader), loss.data[0]))
+                100. * batch_idx / len(train_loader), to_python_float(loss.data)))

 def test():
     model.eval()
     test_loss = 0
     correct = 0
     for data, target in test_loader:
+        with torch.no_grad():
             if args.cuda:
                 data, target = data.cuda(), target.cuda()
-            data, target = Variable(data, volatile=True), Variable(target)
+            data, target = Variable(data), Variable(target)
             output = model(data)
-            test_loss += F.nll_loss(output, target, size_average=False).data[0] # sum up batch loss
+            test_loss += to_python_float(F.nll_loss(output, target, size_average=False).data) # sum up batch loss
             pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
             correct += pred.eq(target.data.view_as(pred)).cpu().sum()
...
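Both `loss.data[0]` call sites above move to `to_python_float` because indexing a zero-dimensional tensor with `[0]` stopped working in PyTorch 0.4, and `volatile=True` gives way to the `torch.no_grad()` context for the same reason. A rough sketch of what `apex.fp16_utils.to_python_float` amounts to, hedged since the actual helper lives in apex:

def to_python_float(t):
    # Scalar tensors in PyTorch >= 0.4 expose .item(); fall back to
    # one-element indexing on older versions where .item() is absent.
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]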
-python -m apex.parallel.multiproc main.py
+export CUDA_VISIBLE_DEVICES=0,1; python -m apex.parallel.multiproc main.py
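The new launch line restricts the run to two GPUs before `apex.parallel.multiproc` starts its workers (my reading is one process per visible device, though the exact spawn policy is apex's). A quick way to check what each worker will see, assuming the variable is set before CUDA initializes in the process:

import os
import torch

# Must be set before the first CUDA call; afterwards the device list
# visible to this process is frozen.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '0,1')
print(torch.cuda.device_count())  # 2 on a machine with at least two GPUs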