Commit aa817132 authored by Michael Carilli's avatar Michael Carilli

Ported examples to use torch.distributed.launch

parent 12b49b98
**distributed_data_parallel.py** shows an example using `FP16_Optimizer` with
`apex.parallel.DistributedDataParallel`.
**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
`apex.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
The usage of `FP16_Optimizer` with distributed training does not need to change from ordinary
single-process usage. Test via
```bash
......
......@@ -5,20 +5,12 @@ from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import FP16_Optimizer
parser = argparse.ArgumentParser()
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument('--world-size', default=2, type=int,
help='Number of distributed processes.')
parser.add_argument("--rank", type=int,
help='Rank of this process')
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()
torch.cuda.set_device(args.rank)
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl',
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank)
init_method='env://')
torch.backends.cudnn.benchmark = True
......
#!/bin/bash
# By default, apex.parallel.multiproc will attempt to use all available GPUs on the system.
# The number of GPUs to use can be limited by setting CUDA_VISIBLE_DEVICES:
export CUDA_VISIBLE_DEVICES=0,1
python -m apex.parallel.multiproc distributed_data_parallel.py
python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
`apex.parallel.DistributedDataParallel` in conjunction with the legacy Apex
launcher script, `apex.parallel.multiproc`. See
[FP16_Optimizer_simple/distributed_apex](https://github.com/NVIDIA/apex/tree/torch_launcher/examples/FP16_Optimizer_simple/distributed_apex) for a more up-to-date example that uses the Pytorch launcher
script, `torch.distributed.launch`.
The usage of `FP16_Optimizer` with distributed training does not need to change from ordinary
single-process usage. Test via
```bash
bash run.sh
```
import torch
from torch.autograd import Variable
import argparse
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import FP16_Optimizer
parser = argparse.ArgumentParser()
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--world-size', default=2, type=int,
                    help='Number of distributed processes.')
parser.add_argument("--rank", type=int,
                    help='Rank of this process')
args = parser.parse_args()

torch.cuda.set_device(args.rank)
torch.distributed.init_process_group(backend='nccl',
                                     init_method=args.dist_url,
                                     world_size=args.world_size,
                                     rank=args.rank)
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
x = Variable(torch.cuda.FloatTensor(N, D_in ).normal_()).half()
y = Variable(torch.cuda.FloatTensor(N, D_out).normal_()).half()
model = torch.nn.Linear(D_in, D_out).cuda().half()
model = DDP(model)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
### Construct FP16_Optimizer ###
optimizer = FP16_Optimizer(optimizer)
###
loss_fn = torch.nn.MSELoss()
for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred.float(), y.float())

    ### Change loss.backward() to: ###
    optimizer.backward(loss)
    ###

    optimizer.step()
print("final loss = ", loss)
#!/bin/bash
# By default, apex.parallel.multiproc will attempt to use all available GPUs on the system.
# The number of GPUs to use can be limited by setting CUDA_VISIBLE_DEVICES:
export CUDA_VISIBLE_DEVICES=0,1
python -m apex.parallel.multiproc distributed_data_parallel.py
**distributed_data_parallel.py** shows an example using `FP16_Optimizer` with
`torch.nn.parallel.DistributedDataParallel`.
**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
`torch.nn.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
The usage of `FP16_Optimizer` with distributed training does not need to change from ordinary
single-process usage. Test via
```bash
......
......@@ -4,7 +4,7 @@ import argparse
from apex.fp16_utils import FP16_Optimizer
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
......
# Multiprocess Example based on pytorch/examples/mnist
main.py demonstrates how to modify a simple model to enable multiprocess distributed data parallel
training using the module wrapper `apex.parallel.DistributedDataParallel`
(similar to `torch.nn.parallel.DistributedDataParallel`).
Multiprocess distributed data parallel training frequently outperforms single-process
......@@ -29,27 +29,25 @@ To download the dataset, run
```python main.py```
without any arguments. Once you have downloaded the dataset, you should not need to do this again.
You can now launch multi-process distributed data parallel jobs via
`main.py` runs multiprocess distributed data parallel jobs using the Pytorch launcher script
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
Jobs are launched via
```bash
python -m apex.parallel.multiproc main.py args...
python -m torch.distributed.launch --nproc_per_node=N main.py args...
```
adding any `args...` you like. The launch script `apex.parallel.multiproc` will
spawn one process for each of your system's available (visible) GPUs.
Each process will run `python main.py args... --world-size <worldsize> --rank <rank>`
(the `--world-size` and `--rank` arguments are determined and appended by `apex.parallel.multiproc`).
Each `main.py` calls `torch.cuda.set_device()` and `torch.distributed.init_process_group()`
according to the `rank` and `world-size` arguments it receives.
The number of visible GPU devices (and therefore the number of processes
`DistributedDataParallel` will spawn) can be controlled by setting the environment variable
`CUDA_VISIBLE_DEVICES`. For example, if you `export CUDA_VISIBLE_DEVICES=0,1` and run
```python -m apex.parallel.multiproc main.py ...```, the launch utility will spawn two processes
which will run on devices 0 and 1. By default, if `CUDA_VISIBLE_DEVICES` is unset,
`apex.parallel.multiproc` will attempt to use every device on the node.
`torch.distributed.launch` spawns `N` processes, each of which runs as
`python main.py args... --local_rank <rank>`.
The `local_rank` argument for each process is determined and appended by `torch.distributed.launch`,
and varies between 0 and `N-1`. `torch.distributed.launch` also provides environment variables
for each process.
Internally, each process calls `set_device` according to its local
rank and `init_process_group` with `init_method='env://'` to ingest the provided environment
variables.
For best performance, set `N` equal to the number of visible CUDA devices on the node.
## Converting your own model
To understand how to convert your own model, please see all sections of main.py within ```#=====START: ADDED FOR DISTRIBUTED======``` and ```#=====END: ADDED FOR DISTRIBUTED======``` flags.
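For orientation, the additions in those flagged sections boil down to the pattern below. This is an illustrative sketch with a placeholder model and dataset (not main.py's actual MNIST code), and it assumes the script is started through `torch.distributed.launch` so that `--local_rank` and the `env://` variables are supplied.
```python
# Condensed sketch of the distributed additions; placeholder model/data.
import argparse
import torch
from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)   # appended by torch.distributed.launch
args = parser.parse_args()

torch.cuda.set_device(args.local_rank)                     # bind this process to its own GPU
torch.distributed.init_process_group(backend='nccl',
                                     init_method='env://') # reads RANK/WORLD_SIZE/MASTER_* from the environment

model = torch.nn.Linear(784, 10).cuda()
model = DDP(model)                                         # gradients are all-reduced across processes

# Shard the training data so each rank sees a disjoint subset.
dataset = torch.utils.data.TensorDataset(torch.randn(1024, 784),
                                         torch.randint(0, 10, (1024,)))
sampler = torch.utils.data.distributed.DistributedSampler(dataset)
loader = torch.utils.data.DataLoader(dataset, batch_size=64,
                                     shuffle=False, sampler=sampler)
```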
## Requirements
Pytorch master branch built from source. This requirement is to use NCCL as a distributed backend.
Pytorch with NCCL available as a distributed backend. Pytorch 0.4+, installed as a pip or conda package, should have this by default. Otherwise, you can build Pytorch from source, in an environment where NCCL is installed and visible.
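If you are unsure whether your install satisfies this, a quick check (assuming a PyTorch recent enough to expose these helper functions) is:
```python
# Sanity check for CUDA and the NCCL distributed backend.
import torch
import torch.distributed as dist

print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("NCCL backend available:", dist.is_available() and dist.is_nccl_available())
```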
from __future__ import print_function
import argparse
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
......@@ -49,20 +50,10 @@ parser.add_argument('--log-interval', type=int, default=10, metavar='N',
Add some distributed options. For explanation of dist-url and dist-backend please see
http://pytorch.org/tutorials/intermediate/dist_tuto.html
--world-size and --rank are required parameters as they will be used by the multiproc.py launcher
but do not have to be set explicitly.
--local_rank will be supplied by the Pytorch launcher wrapper (torch.distributed.launch)
'''
parser.add_argument("--local_rank", default=0, type=int)
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--world-size', default=1, type=int,
help='Number of GPUs to use. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
parser.add_argument('--rank', default=0, type=int,
help='Used for multi-process training. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
#=====END: ADDED FOR DISTRIBUTED======
args = parser.parse_args()
......@@ -70,24 +61,23 @@ args.cuda = not args.no_cuda and torch.cuda.is_available()
#======START: ADDED FOR DISTRIBUTED======
'''Add a convenience flag to see if we are running distributed'''
args.distributed = args.world_size > 1
args.distributed = False
if 'WORLD_SIZE' in os.environ:
args.distributed = int(os.environ['WORLD_SIZE']) > 1
'''Check that we are running with cuda, as distributed is only supported for cuda.'''
if args.distributed:
'''Check that we are running with cuda, as distributed is only supported for cuda.'''
assert args.cuda, "Distributed mode requires running with CUDA."
if args.distributed:
'''
Set cuda device so everything is done on the right GPU.
THIS MUST BE DONE AS SOON AS POSSIBLE.
'''
torch.cuda.set_device(args.rank % torch.cuda.device_count())
torch.cuda.set_device(args.local_rank)
'''Initialize distributed communication'''
dist.init_process_group(args.dist_backend,
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank)
torch.distributed.init_process_group(backend='nccl',
init_method='env://')
#=====END: ADDED FOR DISTRIBUTED======
......@@ -145,7 +135,7 @@ class Net(nn.Module):
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x)
return F.log_softmax(x, dim=1)
model = Net()
if args.cuda:
......@@ -174,7 +164,7 @@ def train(epoch):
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
if batch_idx % args.log_interval == 0 and args.local_rank == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), to_python_float(loss.data)))
......@@ -194,6 +184,7 @@ def test():
correct += pred.eq(target.data.view_as(pred)).cpu().sum()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
......@@ -206,4 +197,5 @@ for epoch in range(1, args.epochs + 1):
#=====END: ADDED FOR DISTRIBUTED======
train(epoch)
test()
if args.local_rank == 0:
test()
......@@ -3,20 +3,12 @@
This example is based on [https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet).
It implements training of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset.
`main.py` and `main_fp16_optimizer.py` have been modified to use the `DistributedDataParallel` module in Apex instead of the one in upstream PyTorch. For description of how this works please see the distributed example included in this repo.
`main.py` with the `--fp16` argument demonstrates mixed precision training with manual management of master parameters and loss scaling.
`main_fp16_optimizer.py` with `--fp16` demonstrates use of `apex.fp16_utils.FP16_Optimizer` to automatically manage master parameters and loss scaling.
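For context, the "manual management" that `main.py` refers to looks roughly like the sketch below; this is illustrative only (placeholder model, data, and loss scale), not `main.py`'s exact code.
```python
# Schematic of manual mixed precision: FP16 model/activations, FP32 master weights,
# and a fixed loss scale. All names and values here are placeholders.
import torch

model = torch.nn.Linear(1024, 16).cuda().half()              # FP16 model
master_params = [p.detach().clone().float() for p in model.parameters()]
optimizer = torch.optim.SGD(master_params, lr=1e-3)          # optimizer owns the FP32 masters
loss_scale = 128.0

x = torch.cuda.FloatTensor(64, 1024).normal_().half()
y = torch.cuda.FloatTensor(64, 16).normal_().half()

model.zero_grad()
optimizer.zero_grad()
loss = torch.nn.MSELoss()(model(x).float(), y.float())
(loss * loss_scale).backward()                               # scale so small FP16 grads don't flush to zero

for mp, p in zip(master_params, model.parameters()):         # copy FP16 grads into FP32 masters, unscaled
    mp.grad = p.grad.detach().float() / loss_scale
optimizer.step()                                             # FP32 weight update

with torch.no_grad():                                        # push updated masters back into the FP16 model
    for p, mp in zip(model.parameters(), master_params):
        p.copy_(mp)
```
`FP16_Optimizer` wraps these bookkeeping steps (plus optional dynamic loss scaling) behind the usual `zero_grad()`/`backward()`/`step()` calls, so the per-iteration change reduces to calling `optimizer.backward(loss)` instead of `loss.backward()`.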
To run multi-gpu on a single node use the command
```python -m apex.parallel.multiproc main.py ...```
adding any normal arguments.
## Requirements
- Apex which can be installed from https://www.github.com/nvidia/apex
- Install PyTorch from source, master branch of [pytorch on github](https://www.github.com/pytorch/pytorch).
- `pip install -r requirements.txt`
- Download the ImageNet dataset and move validation images to labeled subfolders
- To do this, you can use the following script: https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh
......@@ -34,7 +26,21 @@ python main.py -a alexnet --lr 0.01 /path/to/imagenet/folder
The directory at /path/to/imagenet/directory should contain two subdirectories called "train"
and "val" that contain the training and validation data respectively. Train images are expected to be 256x256 jpegs.
**Example commands:** (note: batch size `--b 256` assumes your GPUs have >=16GB of onboard memory)
## Distributed training
`main.py` and `main_fp16_optimizer.py` have been modified to use the `DistributedDataParallel` module in Apex instead of the one in upstream PyTorch. `apex.parallel.DistributedDataParallel`
is a drop-in replacement for `torch.nn.parallel.DistributedDataParallel` (see our [distributed example](https://github.com/NVIDIA/apex/tree/master/examples/distributed)).
The scripts can interact with
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility)
to spawn multiprocess jobs using the following syntax:
```
python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main.py args...
```
`NUM_GPUS` should be less than or equal to the number of visible GPU devices on the node.
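The number of visible devices (which respects `CUDA_VISIBLE_DEVICES`) can be checked with a one-liner before picking `NUM_GPUS`:
```python
# Prints how many CUDA devices are visible to this process.
import torch
print(torch.cuda.device_count())
```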
## Example commands
(note: batch size `--b 256` assumes your GPUs have >=16GB of onboard memory)
```bash
### Softlink training dataset into current directory
......@@ -44,69 +50,21 @@ $ ln -sf /data/imagenet/val-jpeg/ val
### Single-process training
$ python main.py -a resnet50 --fp16 --b 256 --workers 4 ./
### Multi-process training (uses all visible GPUs on the node)
$ python -m apex.parallel.multiproc main.py -a resnet50 --fp16 --b 256 --workers 4 ./
$ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main.py -a resnet50 --fp16 --b 256 --workers 4 ./
### Multi-process training on GPUs 0 and 1 only
$ export CUDA_VISIBLE_DEVICES=0,1
$ python -m apex.parallel.multiproc main.py -a resnet50 --fp16 --b 256 --workers 4 ./
$ python -m torch.distributed.launch --nproc_per_node=2 main.py -a resnet50 --fp16 --b 256 --workers 4 ./
### Multi-process training with FP16_Optimizer, default loss scale 1.0 (still uses FP32 master params)
$ python -m apex.parallel.multiproc main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --workers 4 ./
$ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --workers 4 ./
### Multi-process training with FP16_Optimizer, static loss scale
$ python -m apex.parallel.multiproc main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --static-loss-scale 128.0 --workers 4 ./
$ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --static-loss-scale 128.0 --workers 4 ./
### Multi-process training with FP16_Optimizer, dynamic loss scaling
$ python -m apex.parallel.multiproc main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --dynamic-loss-scale --workers 4 ./
$ python -m torch.distributed.launch --nproc_per_node=NUM_GPUS main_fp16_optimizer.py -a resnet50 --fp16 --b 256 --dynamic-loss-scale --workers 4 ./
```
## Usage for `main.py` and `main_fp16_optimizer.py`
```bash
usage: main.py [-h] [--arch ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N]
[--lr LR] [--momentum M] [--weight-decay W] [--print-freq N]
[--resume PATH] [-e] [--pretrained] [--fp16]
[--static-loss-scale STATIC_LOSS_SCALE] [--prof]
[--dist-url DIST_URL] [--dist-backend DIST_BACKEND]
[--world-size WORLD_SIZE] [--rank RANK]
DIR
PyTorch ImageNet Training
positional arguments:
DIR path to dataset
optional arguments:
-h, --help show this help message and exit
--arch ARCH, -a ARCH model architecture: alexnet | densenet121 |
densenet161 | densenet169 | densenet201 | inception_v3
| resnet101 | resnet152 | resnet18 | resnet34 |
resnet50 | squeezenet1_0 | squeezenet1_1 | vgg11 |
vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn | vgg19
| vgg19_bn (default: resnet18)
-j N, --workers N number of data loading workers (default: 4)
--epochs N number of total epochs to run
--start-epoch N manual epoch number (useful on restarts)
-b N, --batch-size N mini-batch size (default: 256)
--lr LR, --learning-rate LR
initial learning rate
--momentum M momentum
--weight-decay W, --wd W
weight decay (default: 1e-4)
--print-freq N, -p N print frequency (default: 10)
--resume PATH path to latest checkpoint (default: none)
-e, --evaluate evaluate model on validation set
--pretrained use pre-trained model
--fp16 Run model fp16 mode.
--static-loss-scale STATIC_LOSS_SCALE
Static loss scale, positive power of 2 values can improve
fp16 convergence.
--prof Only run 10 iterations for profiling.
--dist-url DIST_URL url used to set up distributed training
--dist-backend DIST_BACKEND
distributed backend
--world-size WORLD_SIZE
Number of GPUs to use. Can either be manually set or
automatically set by using 'python -m multiproc'.
--rank RANK Used for multi-process training. Can either be
manually set or automatically set by using 'python -m
multiproc'.
```
`main_fp16_optimizer.py` also accepts the optional flag
......
......@@ -66,17 +66,7 @@ parser.add_argument('--static-loss-scale', type=float, default=1,
parser.add_argument('--prof', dest='prof', action='store_true',
help='Only run 10 iterations for profiling.')
parser.add_argument('--dist-url', default='file://sync.file', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--world-size', default=1, type=int,
help='Number of GPUs to use. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
parser.add_argument('--rank', default=0, type=int,
help='Used for multi-process training. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
parser.add_argument("--local_rank", default=0, type=int)
cudnn.benchmark = True
......@@ -102,18 +92,19 @@ args = parser.parse_args()
def main():
global best_prec1, args
args.distributed = args.world_size > 1
args.distributed = False
if 'WORLD_SIZE' in os.environ:
args.distributed = int(os.environ['WORLD_SIZE']) > 1
args.gpu = 0
if args.distributed:
args.gpu = args.rank % torch.cuda.device_count()
args.world_size = 1
if args.distributed:
args.gpu = args.local_rank % torch.cuda.device_count()
torch.cuda.set_device(args.gpu)
dist.init_process_group(backend=args.dist_backend,
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank)
torch.distributed.init_process_group(backend='nccl',
init_method='env://')
args.world_size = torch.distributed.get_world_size()
if args.fp16:
assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
......@@ -130,7 +121,7 @@ def main():
if args.fp16:
model = network_to_half(model)
if args.distributed:
#shared param turns off bucketing in DDP, for lower latency runs this can improve perf
# shared param turns off bucketing in DDP, for lower latency runs this can improve perf
model = DDP(model, shared_param=True)
global model_params, master_params
......@@ -215,7 +206,7 @@ def main():
prec1 = validate(val_loader, model, criterion)
# remember best prec@1 and save checkpoint
if args.rank == 0:
if args.local_rank == 0:
is_best = prec1 > best_prec1
best_prec1 = max(prec1, best_prec1)
save_checkpoint({
......@@ -328,7 +319,7 @@ def train(train_loader, model, criterion, optimizer, epoch):
end = time.time()
input, target = prefetcher.next()
if args.rank == 0 and i % args.print_freq == 0 and i > 1:
if args.local_rank == 0 and i % args.print_freq == 0 and i > 1:
print('Epoch: [{0}][{1}/{2}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Speed {3:.3f} ({4:.3f})\t'
......@@ -387,7 +378,7 @@ def validate(val_loader, model, criterion):
batch_time.update(time.time() - end)
end = time.time()
if args.rank == 0 and i % args.print_freq == 0:
if args.local_rank == 0 and i % args.print_freq == 0:
print('Test: [{0}/{1}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Speed {2:.3f} ({3:.3f})\t'
......
......@@ -69,17 +69,7 @@ parser.add_argument('--dynamic-loss-scale', action='store_true',
parser.add_argument('--prof', dest='prof', action='store_true',
help='Only run 10 iterations for profiling.')
parser.add_argument('--dist-url', default='file://sync.file', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--world-size', default=1, type=int,
help='Number of GPUs to use. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
parser.add_argument('--rank', default=0, type=int,
help='Used for multi-process training. Can either be manually set ' +
'or automatically set by using \'python -m multiproc\'.')
parser.add_argument("--local_rank", default=0, type=int)
cudnn.benchmark = True
......@@ -105,18 +95,18 @@ args = parser.parse_args()
def main():
global best_prec1, args
args.distributed = args.world_size > 1
args.distributed = False
if 'WORLD_SIZE' in os.environ:
args.distributed = int(os.environ['WORLD_SIZE']) > 1
args.gpu = 0
if args.distributed:
args.gpu = args.rank % torch.cuda.device_count()
args.gpu = args.local_rank % torch.cuda.device_count()
if args.distributed:
torch.cuda.set_device(args.gpu)
dist.init_process_group(backend=args.dist_backend,
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank)
torch.distributed.init_process_group(backend='nccl',
init_method='env://')
if args.fp16:
assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
......@@ -137,7 +127,7 @@ def main():
if args.fp16:
model = network_to_half(model)
if args.distributed:
#shared param turns off bucketing in DDP, for lower latency runs this can improve perf
# shared param turns off bucketing in DDP, for lower latency runs this can improve perf
model = DDP(model, shared_param=True)
# define loss function (criterion) and optimizer
......@@ -220,7 +210,7 @@ def main():
prec1 = validate(val_loader, model, criterion)
# remember best prec@1 and save checkpoint
if args.rank == 0:
if args.local_rank == 0:
is_best = prec1 > best_prec1
best_prec1 = max(prec1, best_prec1)
save_checkpoint({
......@@ -325,7 +315,7 @@ def train(train_loader, model, criterion, optimizer, epoch):
end = time.time()
input, target = prefetcher.next()
if args.rank == 0 and i % args.print_freq == 0 and i > 1:
if args.local_rank == 0 and i % args.print_freq == 0 and i > 1:
print('Epoch: [{0}][{1}/{2}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Speed {3:.3f} ({4:.3f})\t'
......@@ -334,8 +324,8 @@ def train(train_loader, model, criterion, optimizer, epoch):
'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
epoch, i, len(train_loader),
args.world_size * args.batch_size / batch_time.val,
args.world_size * args.batch_size / batch_time.avg,
torch.distributed.get_world_size() * args.batch_size / batch_time.val,
torch.distributed.get_world_size() * args.batch_size / batch_time.avg,
batch_time=batch_time,
data_time=data_time, loss=losses, top1=top1, top5=top5))
......@@ -384,7 +374,7 @@ def validate(val_loader, model, criterion):
batch_time.update(time.time() - end)
end = time.time()
if args.rank == 0 and i % args.print_freq == 0:
if args.local_rank == 0 and i % args.print_freq == 0:
print('Test: [{0}/{1}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Speed {2:.3f} ({3:.3f})\t'
......@@ -392,8 +382,8 @@ def validate(val_loader, model, criterion):
'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
i, len(val_loader),
args.world_size * args.batch_size / batch_time.val,
args.world_size * args.batch_size / batch_time.avg,
torch.distributed.get_world_size() * args.batch_size / batch_time.val,
torch.distributed.get_world_size() * args.batch_size / batch_time.avg,
batch_time=batch_time, loss=losses,
top1=top1, top5=top5))
......@@ -455,7 +445,7 @@ def accuracy(output, target, topk=(1,)):
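# Averages a tensor (e.g., the loss) across all ranks so that printed metrics reflect the global batch.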
def reduce_tensor(tensor):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.reduce_op.SUM)
rt /= args.world_size
rt /= torch.distributed.get_world_size()
return rt
if __name__ == '__main__':
......